diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index f45a4d82150d50..e389df431505bd 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -22,7 +22,7 @@ set -o pipefail
 
 # Environment variables script works with:
 # List of files affected by this commit
-: ${MODIFIED_FILES:=$(git diff --name-only main...HEAD)}
+: ${MODIFIED_FILES:=$(git diff --name-only HEAD~1)}
 # Filter rules for generic windows tests
 : ${WINDOWS_AGENTS:='{"queue": "windows"}'}
 # Filter rules for generic linux tests
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ded7d9df068e0d..8fb4c7523a6cb3 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,4 +1,32 @@
+# This file lists reviewers that are auto-assigned when a pull request modifies
+# certain files or directories. If you add yourself to this file, you commit to
+# reviewing a large fraction of pull requests in the relevant area.
+#
+# The GitHub "code owners" mechanism is used exclusively to auto-assign
+# reviewers and does not carry significance beyond that. It is not necessary
+# to receive an approval from a "code owner" in particular -- any LLVM project
+# member can approve pull requests.
+#
+# Note that GitHub's concept of "code owner" is independent from LLVM's own
+# "code owner" concept, they merely happen to share terminology. See
+# https://llvm.org/docs/DeveloperPolicy.html#code-owners, as well as the
+# CODE_OWNERS.txt files in the respective subproject directories.
+
 /libcxx/ @llvm/reviewers-libcxx
 /libcxxabi/ @llvm/reviewers-libcxxabi
 /libunwind/ @llvm/reviewers-libunwind
 /runtimes/ @llvm/reviewers-libcxx
+
+/llvm/lib/Analysis/BasicAliasAnalysis.cpp @nikic
+/llvm/lib/Analysis/InstructionSimplify.cpp @nikic
+/llvm/lib/Analysis/LazyValueInfo.cpp @nikic
+/llvm/lib/Analysis/ScalarEvolution.cpp @nikic
+/llvm/lib/Analysis/ValueTracking.cpp @nikic
+/llvm/lib/IR/ConstantRange.cpp @nikic
+/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @nikic
+/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @nikic
+/llvm/lib/Transforms/InstCombine/ @nikic
+
+/clang/test/CXX/drs/ @Endilll
+/clang/www/cxx_dr_status.html @Endilll
+/clang/www/make_cxx_dr_status @Endilll
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 428235b72fa5ad..cc9855ce182b2b 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -170,14 +170,17 @@ jobs:
         uses: actions/download-artifact@v3
         with:
           name: build-baseline
+          path: build-baseline
       - name: Download latest
         uses: actions/download-artifact@v3
         with:
           name: build-latest
+          path: build-latest
       - name: Download symbol list
         uses: actions/download-artifact@v3
         with:
           name: symbol-list
+          path: symbol-list
 
       - name: Install abi-compliance-checker
         run: sudo apt-get install abi-compliance-checker
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 90b801b13ed179..e52e52f5d3f36f 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -68,7 +68,7 @@ jobs:
       matrix:
         target:
           - triple: x86_64-linux-gnu-ubuntu-22.04
-            runs-on: ubuntu-22.04-8x32
+            runs-on: ubuntu-22.04-16x64
             debian-build-deps: >
               chrpath
               gcc-multilib
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index ef57ff3541dc8c..59460105f23137 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -871,6 +871,15 @@ class BinaryContext {
     return nullptr;
   }
 
+  /// Retrieves a reference to ELF's _GLOBAL_OFFSET_TABLE_ symbol, which points
+  /// at GOT, or null if it is not present in the input binary symtab.
+  BinaryData *getGOTSymbol();
+
+  /// Checks if symbol name refers to ELF's _GLOBAL_OFFSET_TABLE_ symbol
+  bool isGOTSymbol(StringRef SymName) const {
+    return SymName == "_GLOBAL_OFFSET_TABLE_";
+  }
+
   /// Return true if \p SymbolName was generated internally and was not present
   /// in the input binary.
   bool isInternalSymbolName(const StringRef Name) {
diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h
index b4a72ac274fade..31cc9071de76ac 100644
--- a/bolt/include/bolt/Core/MCPlus.h
+++ b/bolt/include/bolt/Core/MCPlus.h
@@ -66,6 +66,7 @@ class MCAnnotation {
     kTailCall,            /// Tail call.
     kConditionalTailCall, /// CTC.
     kOffset,              /// Offset in the function.
+    kLabel,               /// MCSymbol pointing to this instruction.
     kGeneric              /// First generic annotation.
   };
 
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 5e12a4ac14c9e4..c9e0e5d599b1fa 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -160,6 +160,7 @@ class MCPlusBuilder {
   const MCInstrAnalysis *Analysis;
   const MCInstrInfo *Info;
   const MCRegisterInfo *RegInfo;
+  const MCSubtargetInfo *STI;
 
   /// Map annotation name into an annotation index.
   StringMap<uint64_t> AnnotationNameIndexMap;
@@ -331,8 +332,8 @@ class MCPlusBuilder {
 
 public:
   MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
-                const MCRegisterInfo *RegInfo)
-      : Analysis(Analysis), Info(Info), RegInfo(RegInfo) {
+                const MCRegisterInfo *RegInfo, const MCSubtargetInfo *STI)
+      : Analysis(Analysis), Info(Info), RegInfo(RegInfo), STI(STI) {
     // Initialize the default annotation allocator with id 0
     AnnotationAllocators.emplace(0, AnnotationAllocator());
     MaxAllocatorId++;
@@ -1179,6 +1180,13 @@ class MCPlusBuilder {
   /// Remove offset annotation.
   bool clearOffset(MCInst &Inst);
 
+  /// Return the label of \p Inst, if available.
+  std::optional<MCSymbol *> getLabel(const MCInst &Inst) const;
+
+  /// Set the label of \p Inst. This label will be emitted right before \p Inst
+  /// is emitted to MCStreamer.
+  bool setLabel(MCInst &Inst, MCSymbol *Label);
+
   /// Return MCSymbol that represents a target of this instruction at a given
   /// operand number \p OpNum. If there's no symbol associated with
   /// the operand - return nullptr.
@@ -2079,15 +2087,18 @@ class MCPlusBuilder {
 
 MCPlusBuilder *createX86MCPlusBuilder(const MCInstrAnalysis *,
                                       const MCInstrInfo *,
-                                      const MCRegisterInfo *);
+                                      const MCRegisterInfo *,
+                                      const MCSubtargetInfo *);
 
 MCPlusBuilder *createAArch64MCPlusBuilder(const MCInstrAnalysis *,
                                           const MCInstrInfo *,
-                                          const MCRegisterInfo *);
+                                          const MCRegisterInfo *,
+                                          const MCSubtargetInfo *);
 
 MCPlusBuilder *createRISCVMCPlusBuilder(const MCInstrAnalysis *,
                                         const MCInstrInfo *,
-                                        const MCRegisterInfo *);
+                                        const MCRegisterInfo *,
+                                        const MCSubtargetInfo *);
 
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/include/bolt/Core/Relocation.h b/bolt/include/bolt/Core/Relocation.h
index 5ae288a91986e5..bc16d952e7a291 100644
--- a/bolt/include/bolt/Core/Relocation.h
+++ b/bolt/include/bolt/Core/Relocation.h
@@ -84,6 +84,7 @@ struct Relocation {
 
   /// Special relocation type that allows the linker to modify the instruction.
   static bool isX86GOTPCRELX(uint64_t Type);
+  static bool isX86GOTPC64(uint64_t Type);
 
   /// Return true if relocation type is NONE
   static bool isNone(uint64_t Type);
@@ -97,6 +98,10 @@ struct Relocation {
   /// Return true if relocation type is for thread local storage.
   static bool isTLS(uint64_t Type);
 
+  /// Return true of relocation type is for referencing a specific instruction
+  /// (as opposed to a function, basic block, etc).
+  static bool isInstructionReference(uint64_t Type);
+
   /// Return code for a NONE relocation
   static uint64_t getNone();
 
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 1dacdc944f9775..329fe356603d0e 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -584,7 +584,8 @@ class RewriteInstance {
 MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch,
                                    const MCInstrAnalysis *Analysis,
                                    const MCInstrInfo *Info,
-                                   const MCRegisterInfo *RegInfo);
+                                   const MCRegisterInfo *RegInfo,
+                                   const MCSubtargetInfo *STI);
 
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 8132e5c213af44..b5514228d7a254 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1026,6 +1026,31 @@ BinaryContext::getBinaryDataContainingAddressImpl(uint64_t Address) const {
   return nullptr;
 }
 
+BinaryData *BinaryContext::getGOTSymbol() {
+  // First tries to find a global symbol with that name
+  BinaryData *GOTSymBD = getBinaryDataByName("_GLOBAL_OFFSET_TABLE_");
+  if (GOTSymBD)
+    return GOTSymBD;
+
+  // This symbol might be hidden from run-time link, so fetch the local
+  // definition if available.
+  GOTSymBD = getBinaryDataByName("_GLOBAL_OFFSET_TABLE_/1");
+  if (!GOTSymBD)
+    return nullptr;
+
+  // If the local symbol is not unique, fail
+  unsigned Index = 2;
+  SmallString<30> Storage;
+  while (const BinaryData *BD =
+             getBinaryDataByName(Twine("_GLOBAL_OFFSET_TABLE_/")
+                                     .concat(Twine(Index++))
+                                     .toStringRef(Storage)))
+    if (BD->getAddress() != GOTSymBD->getAddress())
+      return nullptr;
+
+  return GOTSymBD;
+}
+
 bool BinaryContext::setBinaryDataSize(uint64_t Address, uint64_t Size) {
   auto NI = BinaryDataMap.find(Address);
   assert(NI != BinaryDataMap.end());
@@ -1863,6 +1888,8 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
   }
   if (std::optional<uint32_t> Offset = MIB->getOffset(Instruction))
     OS << " # Offset: " << *Offset;
+  if (auto Label = MIB->getLabel(Instruction))
+    OS << " # Label: " << **Label;
 
   MIB->printAnnotations(Instruction, OS);
 
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 95ab63521c06a5..b1ee6cc221d716 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -498,6 +498,9 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
         BB->getLocSyms().emplace_back(Offset, LocSym);
       }
 
+      if (auto Label = BC.MIB->getLabel(Instr))
+        Streamer.emitLabel(*Label);
+
       Streamer.emitInstruction(Instr, *BC.STI);
       LastIsPrefix = BC.MIB->isPrefix(Instr);
     }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 6cf1cfb50a9c12..c475e4f9035fe0 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1173,6 +1173,13 @@ bool BinaryFunction::disassemble() {
   // basic block.
   Labels[0] = Ctx->createNamedTempSymbol("BB0");
 
+  // Map offsets in the function to a label that should always point to the
+  // corresponding instruction. This is used for labels that shouldn't point to
+  // the start of a basic block but always to a specific instruction. This is
+  // used, for example, on RISC-V where %pcrel_lo relocations point to the
+  // corresponding %pcrel_hi.
+  LabelsMapType InstructionLabels;
+
   uint64_t Size = 0; // instruction size
   for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
     MCInst Instruction;
@@ -1329,9 +1336,23 @@ bool BinaryFunction::disassemble() {
                 ItrE = Relocations.lower_bound(Offset + Size);
            Itr != ItrE; ++Itr) {
         const Relocation &Relocation = Itr->second;
+        MCSymbol *Symbol = Relocation.Symbol;
+
+        if (Relocation::isInstructionReference(Relocation.Type)) {
+          uint64_t RefOffset = Relocation.Value - getAddress();
+          LabelsMapType::iterator LI = InstructionLabels.find(RefOffset);
+
+          if (LI == InstructionLabels.end()) {
+            Symbol = BC.Ctx->createNamedTempSymbol();
+            InstructionLabels.emplace(RefOffset, Symbol);
+          } else {
+            Symbol = LI->second;
+          }
+        }
+
         int64_t Value = Relocation.Value;
         const bool Result = BC.MIB->replaceImmWithSymbolRef(
-            Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value,
+            Instruction, Symbol, Relocation.Addend, Ctx.get(), Value,
             Relocation.Type);
         (void)Result;
         assert(Result && "cannot replace immediate with relocation");
@@ -1366,6 +1387,13 @@ bool BinaryFunction::disassemble() {
     addInstruction(Offset, std::move(Instruction));
   }
 
+  for (auto [Offset, Label] : InstructionLabels) {
+    InstrMapType::iterator II = Instructions.find(Offset);
+    assert(II != Instructions.end() && "reference to non-existing instruction");
+
+    BC.MIB->setLabel(II->second, Label);
+  }
+
   // Reset symbolizer for the disassembler.
   BC.SymbolicDisAsm->setSymbolizer(nullptr);
 
@@ -1761,7 +1789,8 @@ bool BinaryFunction::postProcessIndirectBranches(
   uint64_t LastJT = 0;
   uint16_t LastJTIndexReg = BC.MIB->getNoRegister();
   for (BinaryBasicBlock &BB : blocks()) {
-    for (MCInst &Instr : BB) {
+    for (BinaryBasicBlock::iterator II = BB.begin(); II != BB.end(); ++II) {
+      MCInst &Instr = *II;
       if (!BC.MIB->isIndirectBranch(Instr))
         continue;
 
@@ -1789,7 +1818,7 @@ bool BinaryFunction::postProcessIndirectBranches(
         const MCExpr *DispExpr;
         MCInst *PCRelBaseInstr;
         IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
-            Instr, BB.begin(), BB.end(), PtrSize, MemLocInstr, BaseRegNum,
+            Instr, BB.begin(), II, PtrSize, MemLocInstr, BaseRegNum,
             IndexRegNum, DispValue, DispExpr, PCRelBaseInstr);
         if (Type != IndirectBranchType::UNKNOWN || MemLocInstr != nullptr)
           continue;
@@ -4488,7 +4517,7 @@ void BinaryFunction::addRelocation(uint64_t Address, MCSymbol *Symbol,
   uint64_t Offset = Address - getAddress();
   LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addRelocation in "
                     << formatv("{0}@{1:x} against {2}\n", *this, Offset,
-                               Symbol->getName()));
+                               (Symbol ? Symbol->getName() : "<undef>")));
   bool IsCI = BC.isAArch64() && isInConstantIsland(Address);
   std::map<uint64_t, Relocation> &Rels =
       IsCI ? Islands->Relocations : Relocations;
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index 8d24c9281b8f2a..22ca8c5acccf12 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -495,7 +495,8 @@ void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
   const endianness Endian =
       BC->DwCtx->isLittleEndian() ? support::little : support::big;
   const DWARFSection &AddrSec = BC->DwCtx->getDWARFObj().getAddrSection();
-  DWARFDataExtractor AddrData(BC->DwCtx->getDWARFObj(), AddrSec, Endian, 0);
+  DWARFDataExtractor AddrData(BC->DwCtx->getDWARFObj(), AddrSec,
+                              Endian == support::little, 0);
   DWARFDebugAddrTable AddrTable;
   DIDumpOptions DumpOpts;
   constexpr uint32_t HeaderSize = 8;
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index 027cef1063ee3e..0a5eb44e4876fe 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -268,6 +268,17 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) {
   return true;
 }
 
+std::optional<MCSymbol *> MCPlusBuilder::getLabel(const MCInst &Inst) const {
+  if (auto Label = tryGetAnnotationAs<MCSymbol *>(Inst, MCAnnotation::kLabel))
+    return *Label;
+  return std::nullopt;
+}
+
+bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) {
+  getOrCreateAnnotationAs<MCSymbol *>(Inst, MCAnnotation::kLabel) = Label;
+  return true;
+}
+
 bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const {
   const MCInst *AnnotationInst = getAnnotationInst(Inst);
   if (!AnnotationInst)
diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp
index a73545905c545f..e4d0f26c305f4e 100644
--- a/bolt/lib/Core/Relocation.cpp
+++ b/bolt/lib/Core/Relocation.cpp
@@ -20,6 +20,13 @@
 using namespace llvm;
 using namespace bolt;
 
+namespace ELFReserved {
+enum {
+  R_RISCV_TPREL_I = 49,
+  R_RISCV_TPREL_S = 50,
+};
+} // namespace ELFReserved
+
 Triple::ArchType Relocation::Arch;
 
 static bool isSupportedX86(uint64_t Type) {
@@ -35,6 +42,7 @@ static bool isSupportedX86(uint64_t Type) {
   case ELF::R_X86_64_PC32:
   case ELF::R_X86_64_PC64:
   case ELF::R_X86_64_PLT32:
+  case ELF::R_X86_64_GOTPC64:
   case ELF::R_X86_64_GOTPCREL:
   case ELF::R_X86_64_GOTTPOFF:
   case ELF::R_X86_64_TPOFF32:
@@ -110,6 +118,13 @@ static bool isSupportedRISCV(uint64_t Type) {
   case ELF::R_RISCV_LO12_I:
   case ELF::R_RISCV_LO12_S:
   case ELF::R_RISCV_64:
+  case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TPREL_HI20:
+  case ELF::R_RISCV_TPREL_ADD:
+  case ELF::R_RISCV_TPREL_LO12_I:
+  case ELF::R_RISCV_TPREL_LO12_S:
+  case ELFReserved::R_RISCV_TPREL_I:
+  case ELFReserved::R_RISCV_TPREL_S:
     return true;
   }
 }
@@ -136,6 +151,7 @@ static size_t getSizeForTypeX86(uint64_t Type) {
     return 4;
   case ELF::R_X86_64_PC64:
   case ELF::R_X86_64_64:
+  case ELF::R_X86_64_GOTPC64:
     return 8;
   }
 }
@@ -212,6 +228,7 @@ static size_t getSizeForTypeRISCV(uint64_t Type) {
     return 4;
   case ELF::R_RISCV_64:
   case ELF::R_RISCV_GOT_HI20:
+  case ELF::R_RISCV_TLS_GOT_HI20:
     // See extractValueRISCV for why this is necessary.
     return 8;
   }
@@ -530,6 +547,7 @@ static uint64_t extractValueRISCV(uint64_t Type, uint64_t Contents,
   case ELF::R_RISCV_BRANCH:
     return extractBImmRISCV(Contents);
   case ELF::R_RISCV_GOT_HI20:
+  case ELF::R_RISCV_TLS_GOT_HI20:
     // We need to know the exact address of the GOT entry so we extract the
     // value from both the AUIPC and L[D|W]. We cannot rely on the symbol in the
     // relocation for this since it simply refers to the object that is stored
@@ -598,6 +616,7 @@ static bool isGOTRISCV(uint64_t Type) {
   default:
     return false;
   case ELF::R_RISCV_GOT_HI20:
+  case ELF::R_RISCV_TLS_GOT_HI20:
     return true;
   }
 }
@@ -634,6 +653,14 @@ static bool isTLSRISCV(uint64_t Type) {
   switch (Type) {
   default:
     return false;
+  case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TPREL_HI20:
+  case ELF::R_RISCV_TPREL_ADD:
+  case ELF::R_RISCV_TPREL_LO12_I:
+  case ELF::R_RISCV_TPREL_LO12_S:
+  case ELFReserved::R_RISCV_TPREL_I:
+  case ELFReserved::R_RISCV_TPREL_S:
+    return true;
   }
 }
 
@@ -655,6 +682,7 @@ static bool isPCRelativeX86(uint64_t Type) {
   case ELF::R_X86_64_PLT32:
   case ELF::R_X86_64_GOTOFF64:
   case ELF::R_X86_64_GOTPC32:
+  case ELF::R_X86_64_GOTPC64:
   case ELF::R_X86_64_GOTTPOFF:
   case ELF::R_X86_64_GOTPCRELX:
   case ELF::R_X86_64_REX_GOTPCRELX:
@@ -730,6 +758,7 @@ static bool isPCRelativeRISCV(uint64_t Type) {
   case ELF::R_RISCV_RVC_JUMP:
   case ELF::R_RISCV_RVC_BRANCH:
   case ELF::R_RISCV_32_PCREL:
+  case ELF::R_RISCV_TLS_GOT_HI20:
     return true;
   }
 }
@@ -797,6 +826,12 @@ bool Relocation::isX86GOTPCRELX(uint64_t Type) {
   return Type == ELF::R_X86_64_GOTPCRELX || Type == ELF::R_X86_64_REX_GOTPCRELX;
 }
 
+bool Relocation::isX86GOTPC64(uint64_t Type) {
+  if (Arch != Triple::x86_64)
+    return false;
+  return Type == ELF::R_X86_64_GOTPC64;
+}
+
 bool Relocation::isNone(uint64_t Type) { return Type == getNone(); }
 
 bool Relocation::isRelative(uint64_t Type) {
@@ -823,6 +858,19 @@ bool Relocation::isTLS(uint64_t Type) {
   return isTLSX86(Type);
 }
 
+bool Relocation::isInstructionReference(uint64_t Type) {
+  if (Arch != Triple::riscv64)
+    return false;
+
+  switch (Type) {
+  default:
+    return false;
+  case ELF::R_RISCV_PCREL_LO12_I:
+  case ELF::R_RISCV_PCREL_LO12_S:
+    return true;
+  }
+}
+
 uint64_t Relocation::getNone() {
   if (Arch == Triple::aarch64)
     return ELF::R_AARCH64_NONE;
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index bb760ea93ad16e..3ba53d7b2b7988 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -575,6 +575,7 @@ bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
 
 void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
   std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
+  std::vector<std::pair<MCInst *, MCSymbol *>> PreservedLabelAnnotations;
 
   for (auto &It : BC.getBinaryFunctions()) {
     BinaryFunction &BF = It.second;
@@ -609,6 +610,8 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
           if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
             PreservedOffsetAnnotations.emplace_back(&(*II),
                                                     *BC.MIB->getOffset(*II));
+          if (auto Label = BC.MIB->getLabel(*II))
+            PreservedLabelAnnotations.emplace_back(&*II, *Label);
           BC.MIB->stripAnnotations(*II);
         }
       }
@@ -625,6 +628,8 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
   // Reinsert preserved annotations we need during code emission.
   for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
     BC.MIB->setOffset(*Item.first, Item.second);
+  for (auto [Instr, Label] : PreservedLabelAnnotations)
+    BC.MIB->setLabel(*Instr, Label);
 }
 
 // Check for dirty state in MCSymbol objects that might be a consequence
diff --git a/bolt/lib/Passes/FixRISCVCallsPass.cpp b/bolt/lib/Passes/FixRISCVCallsPass.cpp
index 963a8b04cf9db2..e2984deda16dc3 100644
--- a/bolt/lib/Passes/FixRISCVCallsPass.cpp
+++ b/bolt/lib/Passes/FixRISCVCallsPass.cpp
@@ -43,7 +43,12 @@ void FixRISCVCallsPass::runOnFunction(BinaryFunction &BF) {
 
         MCInst OldCall = *NextII;
         auto L = BC.scopeLock();
-        MIB->createCall(*II, Target, Ctx);
+
+        if (MIB->isTailCall(*NextII))
+          MIB->createTailCall(*II, Target, Ctx);
+        else
+          MIB->createCall(*II, Target, Ctx);
+
         MIB->moveAnnotations(std::move(OldCall), *II);
 
         // The original offset was set on the jalr of the auipc+jalr pair. Since
diff --git a/bolt/lib/Rewrite/JITLinkLinker.cpp b/bolt/lib/Rewrite/JITLinkLinker.cpp
index 1df086a665789d..5e06121f4df271 100644
--- a/bolt/lib/Rewrite/JITLinkLinker.cpp
+++ b/bolt/lib/Rewrite/JITLinkLinker.cpp
@@ -141,6 +141,19 @@ struct JITLinkLinker::Context : jitlink::JITLinkContext {
             orc::ExecutorAddr(Address), JITSymbolFlags());
         continue;
       }
+
+      if (Linker.BC.isGOTSymbol(SymName)) {
+        if (const BinaryData *I = Linker.BC.getGOTSymbol()) {
+          uint64_t Address =
+              I->isMoved() ? I->getOutputAddress() : I->getAddress();
+          LLVM_DEBUG(dbgs() << "Resolved to address 0x"
+                            << Twine::utohexstr(Address) << "\n");
+          AllResults[Symbol.first] = orc::ExecutorSymbolDef(
+              orc::ExecutorAddr(Address), JITSymbolFlags());
+          continue;
+        }
+      }
+
       LLVM_DEBUG(dbgs() << "Resolved to address 0x0\n");
       AllResults[Symbol.first] =
           orc::ExecutorSymbolDef(orc::ExecutorAddr(0), JITSymbolFlags());
diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp
index b827a196c82653..8be8257f15c1ce 100644
--- a/bolt/lib/Rewrite/MachORewriteInstance.cpp
+++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp
@@ -56,25 +56,28 @@ namespace bolt {
 
 extern MCPlusBuilder *createX86MCPlusBuilder(const MCInstrAnalysis *,
                                              const MCInstrInfo *,
-                                             const MCRegisterInfo *);
+                                             const MCRegisterInfo *,
+                                             const MCSubtargetInfo *);
 extern MCPlusBuilder *createAArch64MCPlusBuilder(const MCInstrAnalysis *,
                                                  const MCInstrInfo *,
-                                                 const MCRegisterInfo *);
+                                                 const MCRegisterInfo *,
+                                                 const MCSubtargetInfo *);
 
 namespace {
 
 MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch,
                                    const MCInstrAnalysis *Analysis,
                                    const MCInstrInfo *Info,
-                                   const MCRegisterInfo *RegInfo) {
+                                   const MCRegisterInfo *RegInfo,
+                                   const MCSubtargetInfo *STI) {
 #ifdef X86_AVAILABLE
   if (Arch == Triple::x86_64)
-    return createX86MCPlusBuilder(Analysis, Info, RegInfo);
+    return createX86MCPlusBuilder(Analysis, Info, RegInfo, STI);
 #endif
 
 #ifdef AARCH64_AVAILABLE
   if (Arch == Triple::aarch64)
-    return createAArch64MCPlusBuilder(Analysis, Info, RegInfo);
+    return createAArch64MCPlusBuilder(Analysis, Info, RegInfo, STI);
 #endif
 
   llvm_unreachable("architecture unsupported by MCPlusBuilder");
@@ -106,8 +109,9 @@ MachORewriteInstance::MachORewriteInstance(object::MachOObjectFile *InputFile,
     return;
   }
   BC = std::move(BCOrErr.get());
-  BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(createMCPlusBuilder(
-      BC->TheTriple->getArch(), BC->MIA.get(), BC->MII.get(), BC->MRI.get())));
+  BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(
+      createMCPlusBuilder(BC->TheTriple->getArch(), BC->MIA.get(),
+                          BC->MII.get(), BC->MRI.get(), BC->STI.get())));
   if (opts::Instrument)
     BC->setRuntimeLibrary(std::make_unique<InstrumentationRuntimeLibrary>());
 }
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 03cbe0461b2112..8b821eb984b950 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -272,20 +272,21 @@ extern const char *BoltRevision;
 MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch,
                                    const MCInstrAnalysis *Analysis,
                                    const MCInstrInfo *Info,
-                                   const MCRegisterInfo *RegInfo) {
+                                   const MCRegisterInfo *RegInfo,
+                                   const MCSubtargetInfo *STI) {
 #ifdef X86_AVAILABLE
   if (Arch == Triple::x86_64)
-    return createX86MCPlusBuilder(Analysis, Info, RegInfo);
+    return createX86MCPlusBuilder(Analysis, Info, RegInfo, STI);
 #endif
 
 #ifdef AARCH64_AVAILABLE
   if (Arch == Triple::aarch64)
-    return createAArch64MCPlusBuilder(Analysis, Info, RegInfo);
+    return createAArch64MCPlusBuilder(Analysis, Info, RegInfo, STI);
 #endif
 
 #ifdef RISCV_AVAILABLE
   if (Arch == Triple::riscv64)
-    return createRISCVMCPlusBuilder(Analysis, Info, RegInfo);
+    return createRISCVMCPlusBuilder(Analysis, Info, RegInfo, STI);
 #endif
 
   llvm_unreachable("architecture unsupported by MCPlusBuilder");
@@ -348,8 +349,9 @@ RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const int Argc,
     return;
   }
   BC = std::move(BCOrErr.get());
-  BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(createMCPlusBuilder(
-      BC->TheTriple->getArch(), BC->MIA.get(), BC->MII.get(), BC->MRI.get())));
+  BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(
+      createMCPlusBuilder(BC->TheTriple->getArch(), BC->MIA.get(),
+                          BC->MII.get(), BC->MRI.get(), BC->STI.get())));
 
   BAT = std::make_unique<BoltAddressTranslation>();
 
@@ -408,8 +410,9 @@ static bool checkOffsets(const typename ELFT::Phdr &Phdr,
     return true;
 
   // Only non-empty sections can be at the end of a segment.
-  uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1;
-  AddressRange SectionAddressRange(Sec.sh_offset, Sec.sh_offset + SectionSize);
+  uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull;
+  AddressRange SectionAddressRange((uint64_t)Sec.sh_offset,
+                                   Sec.sh_offset + SectionSize);
   AddressRange SegmentAddressRange(Phdr.p_offset,
                                    Phdr.p_offset + Phdr.p_filesz);
   if (SegmentAddressRange.contains(SectionAddressRange))
@@ -425,8 +428,9 @@ template <class ELFT>
 static bool checkVMA(const typename ELFT::Phdr &Phdr,
                      const typename ELFT::Shdr &Sec, bool &Overlap) {
   // Only non-empty sections can be at the end of a segment.
-  uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1;
-  AddressRange SectionAddressRange(Sec.sh_addr, Sec.sh_addr + SectionSize);
+  uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull;
+  AddressRange SectionAddressRange((uint64_t)Sec.sh_addr,
+                                   Sec.sh_addr + SectionSize);
   AddressRange SegmentAddressRange(Phdr.p_vaddr, Phdr.p_vaddr + Phdr.p_memsz);
 
   if (SegmentAddressRange.contains(SectionAddressRange))
@@ -2332,7 +2336,8 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
     if (BC->isX86())
       return;
 
-    // The non-got related TLS relocations on AArch64 also could be skipped.
+    // The non-got related TLS relocations on AArch64 and RISC-V also could be
+    // skipped.
     if (!Relocation::isGOT(RType))
       return;
   }
@@ -2394,9 +2399,13 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
   }
 
   MCSymbol *ReferencedSymbol = nullptr;
-  if (!IsSectionRelocation)
+  if (!IsSectionRelocation) {
     if (BinaryData *BD = BC->getBinaryDataByName(SymbolName))
       ReferencedSymbol = BD->getSymbol();
+    else if (BC->isGOTSymbol(SymbolName))
+      if (BinaryData *BD = BC->getGOTSymbol())
+        ReferencedSymbol = BD->getSymbol();
+  }
 
   ErrorOr<BinarySection &> ReferencedSection{std::errc::bad_address};
   symbol_iterator SymbolIter = Rel.getSymbol();
@@ -2537,7 +2546,17 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
     // Adjust the point of reference to a code location inside a function.
     if (ReferencedBF->containsAddress(Address, /*UseMaxSize = */ true)) {
       RefFunctionOffset = Address - ReferencedBF->getAddress();
-      if (RefFunctionOffset) {
+      if (Relocation::isInstructionReference(RType)) {
+        // Instruction labels are created while disassembling so we just leave
+        // the symbol empty for now. Since the extracted value is typically
+        // unrelated to the referenced symbol (e.g., %pcrel_lo in RISC-V
+        // references an instruction but the patched value references the low
+        // bits of a data address), we set the extracted value to the symbol
+        // address in order to be able to correctly reconstruct the reference
+        // later.
+        ReferencedSymbol = nullptr;
+        ExtractedValue = Address;
+      } else if (RefFunctionOffset) {
         if (ContainingBF && ContainingBF != ReferencedBF) {
           ReferencedSymbol =
               ReferencedBF->addEntryPointAtOffset(RefFunctionOffset);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index bf77244102a21b..c2a4607411a49e 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -128,9 +128,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) {
 }
 class AArch64MCPlusBuilder : public MCPlusBuilder {
 public:
-  AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
-                       const MCRegisterInfo *RegInfo)
-      : MCPlusBuilder(Analysis, Info, RegInfo) {}
+  using MCPlusBuilder::MCPlusBuilder;
 
   bool equals(const MCTargetExpr &A, const MCTargetExpr &B,
               CompFuncTy Comp) const override {
@@ -1654,8 +1652,9 @@ namespace bolt {
 
 MCPlusBuilder *createAArch64MCPlusBuilder(const MCInstrAnalysis *Analysis,
                                           const MCInstrInfo *Info,
-                                          const MCRegisterInfo *RegInfo) {
-  return new AArch64MCPlusBuilder(Analysis, Info, RegInfo);
+                                          const MCRegisterInfo *RegInfo,
+                                          const MCSubtargetInfo *STI) {
+  return new AArch64MCPlusBuilder(Analysis, Info, RegInfo, STI);
 }
 
 } // namespace bolt
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index c2e64039a25007..af7645f5684711 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -46,6 +46,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     case ELF::R_RISCV_HI20:
     case ELF::R_RISCV_LO12_I:
     case ELF::R_RISCV_LO12_S:
+    case ELF::R_RISCV_TLS_GOT_HI20:
       return true;
     default:
       llvm_unreachable("Unexpected RISCV relocation type in code");
@@ -86,6 +87,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
       return false;
     case RISCV::JALR:
     case RISCV::C_JALR:
+    case RISCV::C_JR:
       return true;
     }
   }
@@ -157,6 +159,17 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     DispValue = 0;
     DispExpr = nullptr;
     PCRelBaseOut = nullptr;
+
+    // Check for the following long tail call sequence:
+    // 1: auipc xi, %pcrel_hi(sym)
+    // jalr zero, %pcrel_lo(1b)(xi)
+    if (Instruction.getOpcode() == RISCV::JALR && Begin != End) {
+      MCInst &PrevInst = *std::prev(End);
+      if (isRISCVCall(PrevInst, Instruction) &&
+          Instruction.getOperand(0).getReg() == RISCV::X0)
+        return IndirectBranchType::POSSIBLE_TAIL_CALL;
+    }
+
     return IndirectBranchType::UNKNOWN;
   }
 
@@ -396,6 +409,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     default:
       return Expr;
     case ELF::R_RISCV_GOT_HI20:
+    case ELF::R_RISCV_TLS_GOT_HI20:
       // The GOT is reused so no need to create GOT relocations
     case ELF::R_RISCV_PCREL_HI20:
       return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_HI, Ctx);
@@ -457,8 +471,9 @@ namespace bolt {
 
 MCPlusBuilder *createRISCVMCPlusBuilder(const MCInstrAnalysis *Analysis,
                                         const MCInstrInfo *Info,
-                                        const MCRegisterInfo *RegInfo) {
-  return new RISCVMCPlusBuilder(Analysis, Info, RegInfo);
+                                        const MCRegisterInfo *RegInfo,
+                                        const MCSubtargetInfo *STI) {
+  return new RISCVMCPlusBuilder(Analysis, Info, RegInfo, STI);
 }
 
 } // namespace bolt
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 4cb9d61710d1da..ce8a4d69148544 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -87,9 +87,7 @@ static InstructionListType createIncMemory(const MCSymbol *Target,
 
 class X86MCPlusBuilder : public MCPlusBuilder {
 public:
-  X86MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
-                   const MCRegisterInfo *RegInfo)
-      : MCPlusBuilder(Analysis, Info, RegInfo) {}
+  using MCPlusBuilder::MCPlusBuilder;
 
   std::unique_ptr<MCSymbolizer>
   createTargetSymbolizer(BinaryFunction &Function,
@@ -403,6 +401,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     case ELF::R_X86_64_PC8:
     case ELF::R_X86_64_PC32:
     case ELF::R_X86_64_PC64:
+    case ELF::R_X86_64_GOTPC64:
     case ELF::R_X86_64_GOTPCRELX:
     case ELF::R_X86_64_REX_GOTPCRELX:
       return true;
@@ -3578,8 +3577,9 @@ namespace bolt {
 
 MCPlusBuilder *createX86MCPlusBuilder(const MCInstrAnalysis *Analysis,
                                       const MCInstrInfo *Info,
-                                      const MCRegisterInfo *RegInfo) {
-  return new X86MCPlusBuilder(Analysis, Info, RegInfo);
+                                      const MCRegisterInfo *RegInfo,
+                                      const MCSubtargetInfo *STI) {
+  return new X86MCPlusBuilder(Analysis, Info, RegInfo, STI);
 }
 
 } // namespace bolt
diff --git a/bolt/lib/Target/X86/X86MCSymbolizer.cpp b/bolt/lib/Target/X86/X86MCSymbolizer.cpp
index 5eeb18467f84a6..ca7fe137152fd3 100644
--- a/bolt/lib/Target/X86/X86MCSymbolizer.cpp
+++ b/bolt/lib/Target/X86/X86MCSymbolizer.cpp
@@ -130,6 +130,15 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
   if (!Relocation)
     return processPCRelOperandNoRel();
 
+  // GOTPC64 is special because the X86 Assembler doesn't know how to emit
+  // a PC-relative 8-byte fixup, which is what we need to cover this. The
+  // only way to do this is to use the symbol name _GLOBAL_OFFSET_TABLE_.
+  if (Relocation::isX86GOTPC64(Relocation->Type)) {
+    auto [Sym, Addend] = handleGOTPC64(*Relocation, InstAddress);
+    addOperand(Sym, Addend);
+    return true;
+  }
+
   uint64_t SymbolValue = Relocation->Value - Relocation->Addend;
   if (Relocation->isPCRelative())
     SymbolValue += InstAddress + ImmOffset;
@@ -149,6 +158,26 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
   return true;
 }
 
+std::pair<MCSymbol *, uint64_t>
+X86MCSymbolizer::handleGOTPC64(const Relocation &R, uint64_t InstrAddr) {
+  BinaryContext &BC = Function.getBinaryContext();
+  const BinaryData *GOTSymBD = BC.getGOTSymbol();
+  if (!GOTSymBD || !GOTSymBD->getAddress()) {
+    errs() << "BOLT-ERROR: R_X86_GOTPC64 relocation is present but we did "
+              "not detect a valid  _GLOBAL_OFFSET_TABLE_ in symbol table\n";
+    exit(1);
+  }
+  // R_X86_GOTPC64 are not relative to the Reloc nor end of instruction,
+  // but the start of the MOVABSQ instruction. So the Target Address is
+  // whatever is encoded in the original operand when we disassembled
+  // the binary (here, R.Value) plus MOVABSQ address (InstrAddr).
+  // Here we extract the intended Addend by subtracting the real
+  // GOT addr.
+  const int64_t Addend = R.Value + InstrAddr - GOTSymBD->getAddress();
+  return std::make_pair(BC.Ctx->getOrCreateSymbol("_GLOBAL_OFFSET_TABLE_"),
+                        Addend);
+}
+
 void X86MCSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &CStream,
                                                       int64_t Value,
                                                       uint64_t Address) {}
diff --git a/bolt/lib/Target/X86/X86MCSymbolizer.h b/bolt/lib/Target/X86/X86MCSymbolizer.h
index 83a039bd70310b..9ed18b69c74ce4 100644
--- a/bolt/lib/Target/X86/X86MCSymbolizer.h
+++ b/bolt/lib/Target/X86/X86MCSymbolizer.h
@@ -20,6 +20,9 @@ class X86MCSymbolizer : public MCSymbolizer {
   BinaryFunction &Function;
   bool CreateNewSymbols{true};
 
+  std::pair<MCSymbol *, uint64_t> handleGOTPC64(const Relocation &R,
+                                                uint64_t InstrAddr);
+
 public:
   X86MCSymbolizer(BinaryFunction &Function, bool CreateNewSymbols = true)
       : MCSymbolizer(*Function.getBinaryContext().Ctx.get(), nullptr),
diff --git a/bolt/test/RISCV/Inputs/tls-le-gnu-ld.yaml b/bolt/test/RISCV/Inputs/tls-le-gnu-ld.yaml
new file mode 100644
index 00000000000000..feec407ffdfdaf
--- /dev/null
+++ b/bolt/test/RISCV/Inputs/tls-le-gnu-ld.yaml
@@ -0,0 +1,92 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_RISCV
+  Entry:           0x100B0
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .text
+    LastSec:         .text
+    VAddr:           0x10000
+    Align:           0x1000
+    Offset:          0x0
+  - Type:            PT_TLS
+    Flags:           [ PF_R ]
+    FirstSec:        .tbss
+    LastSec:         .tbss
+    VAddr:           0x110C0
+    Align:           0x8
+    Offset:          0xc0
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x100B0
+    AddressAlign:    0x4
+    Content:         '13000000832202002320520067800000'
+  - Name:            .tbss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC, SHF_TLS ]
+    Address:         0x110C0
+    AddressAlign:    0x8
+    Size:            0x8
+  - Name:            .rela.text
+    Type:            SHT_RELA
+    Flags:           [ SHF_INFO_LINK ]
+    Link:            .symtab
+    AddressAlign:    0x8
+    Info:            .text
+    Relocations:
+      - Offset:          0x100B4
+        Type:            R_RISCV_NONE
+      - Offset:          0x100B4
+        Type:            R_RISCV_RELAX
+      - Offset:          0x100B4
+        Type:            R_RISCV_NONE
+      - Offset:          0x100B4
+        Type:            R_RISCV_RELAX
+      - Offset:          0x100B4
+        Symbol:          i
+        Type:            0x31
+      - Offset:          0x100B4
+        Type:            R_RISCV_RELAX
+      - Offset:          0x100B8
+        Symbol:          i
+        Type:            0x32
+      - Offset:          0x100B8
+        Type:            R_RISCV_RELAX
+  - Type:            SectionHeaderTable
+    Sections:
+      - Name:            .text
+      - Name:            .rela.text
+      - Name:            .tbss
+      - Name:            .symtab
+      - Name:            .strtab
+      - Name:            .shstrtab
+Symbols:
+  - Name:            .text
+    Type:            STT_SECTION
+    Section:         .text
+    Value:           0x100B0
+  - Name:            .tbss
+    Type:            STT_SECTION
+    Section:         .tbss
+    Value:           0x110C0
+  - Name:            '__global_pointer$'
+    Index:           SHN_ABS
+    Binding:         STB_GLOBAL
+    Value:           0x118C0
+  - Name:            i
+    Type:            STT_TLS
+    Section:         .tbss
+    Binding:         STB_GLOBAL
+    Size:            0x8
+  - Name:            _start
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x100B0
+    Size:            0x10
+...
diff --git a/bolt/test/RISCV/call-annotations.s b/bolt/test/RISCV/call-annotations.s
index e2c5a1040faee1..bc539bb0ec7796 100644
--- a/bolt/test/RISCV/call-annotations.s
+++ b/bolt/test/RISCV/call-annotations.s
@@ -4,7 +4,7 @@
 // RUN: llvm-mc -triple riscv64 -filetype obj -o %t.o %s
 // RUN: ld.lld --emit-relocs -o %t %t.o
 // RUN: llvm-bolt --enable-bat --print-cfg --print-fix-riscv-calls \
-// RUN:     --print-only=_start -o /dev/null %t | FileCheck %s
+// RUN:     -o /dev/null %t | FileCheck %s
 
   .text
   .global f
@@ -19,11 +19,24 @@ f:
 // CHECK-NEXT: jal ra, f # Offset: 8
 // CHECK-NEXT: jal zero, f # TAILCALL  # Offset: 12
 
+// CHECK-LABEL: Binary Function "long_tail" after building cfg {
+// CHECK:      auipc t1, f
+// CHECK-NEXT: jalr zero, -24(t1) # TAILCALL  # Offset: 8
+
+// CHECK-LABEL: Binary Function "compressed_tail" after building cfg {
+// CHECK:      jr a0 # TAILCALL  # Offset: 0
+
 // CHECK-LABEL: Binary Function "_start" after fix-riscv-calls {
 // CHECK:      call f # Offset: 0
 // CHECK-NEXT: call f # Offset: 8
 // CHECK-NEXT: tail f # TAILCALL   # Offset: 12
 
+// CHECK-LABEL: Binary Function "long_tail" after fix-riscv-calls {
+// CHECK:      tail f # TAILCALL   # Offset: 4
+
+// CHECK-LABEL: Binary Function "compressed_tail" after fix-riscv-calls {
+// CHECK:      jr a0 # TAILCALL  # Offset: 0
+
   .globl _start
   .p2align 1
 _start:
@@ -31,3 +44,21 @@ _start:
   jal f
   jal zero, f
   .size _start, .-_start
+
+  .globl long_tail
+  .p2align 1
+long_tail:
+    // NOTE: BOLT assumes indirect calls in single-BB functions are tail calls
+    // so artificially introduce a second BB to force RISC-V-specific analysis
+    // to get triggered.
+    beq a0, a1, 1f
+1:
+    tail f
+    .size long_tail, .-long_tail
+
+   .globl compressed_tail
+   .p2align 1
+   .option rvc
+compressed_tail:
+    c.jr a0
+    .size compressed_tail, .-compressed_tail
diff --git a/bolt/test/RISCV/reloc-abs.s b/bolt/test/RISCV/reloc-abs.s
index 3e4b8b1395e1ff..5b728f092b3c9f 100644
--- a/bolt/test/RISCV/reloc-abs.s
+++ b/bolt/test/RISCV/reloc-abs.s
@@ -17,8 +17,7 @@ _start:
   .option push
   .option norelax
 1:
-// CHECK: .Ltmp0
-// CHECK: auipc gp, %pcrel_hi(__global_pointer$)
+// CHECK: auipc gp, %pcrel_hi(__global_pointer$) # Label: .Ltmp0
 // CHECK-NEXT: addi gp, gp, %pcrel_lo(.Ltmp0)
   auipc gp, %pcrel_hi(__global_pointer$)
   addi  gp, gp, %pcrel_lo(1b)
diff --git a/bolt/test/RISCV/reloc-bb-split.s b/bolt/test/RISCV/reloc-bb-split.s
new file mode 100644
index 00000000000000..5995562cf130b0
--- /dev/null
+++ b/bolt/test/RISCV/reloc-bb-split.s
@@ -0,0 +1,42 @@
+// RUN: %clang %cflags -o %t %s
+// RUN: llvm-bolt --print-cfg --print-only=_start -o /dev/null %t \
+// RUN:    | FileCheck %s
+
+  .data
+  .globl d
+  .p2align 3
+d:
+  .dword 0
+
+  .text
+  .globl _start
+  .p2align 1
+// CHECK-LABEL: Binary Function "_start" after building cfg {
+_start:
+/// The local label is used for %pcrel_lo as well as a jump target so a new
+/// basic block should start there.
+// CHECK-LABEL: {{^}}.LBB00
+// CHECK: nop
+// CHECK-LABEL: {{^}}.Ltmp0
+// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp1
+// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp1)(t0)
+// CHECK-NEXT: j .Ltmp0
+  nop
+1:
+  auipc t0, %pcrel_hi(d)
+  ld t0, %pcrel_lo(1b)(t0)
+  j 1b
+
+/// The local label is used only for %pcrel_lo so no new basic block should
+/// start there.
+// CHECK-LABEL: {{^}}.LFT0
+// CHECK: nop
+// CHECK-NEXT: auipc t0, %pcrel_hi(d) # Label: .Ltmp2
+// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp2)(t0)
+// CHECK-NEXT: ret
+  nop
+1:
+  auipc t0, %pcrel_hi(d)
+  ld t0, %pcrel_lo(1b)(t0)
+  ret
+  .size _start, .-_start
diff --git a/bolt/test/RISCV/reloc-got.s b/bolt/test/RISCV/reloc-got.s
index b6cd61be723bfa..dcf9d0ea3ffbf2 100644
--- a/bolt/test/RISCV/reloc-got.s
+++ b/bolt/test/RISCV/reloc-got.s
@@ -14,8 +14,7 @@ d:
 // CHECK: Binary Function "_start" after building cfg {
 _start:
   nop // Here to not make the _start and .Ltmp0 symbols coincide
-// CHECK: .Ltmp0
-// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}})
+// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) # Label: .Ltmp0
 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0)
 1:
   auipc t0, %got_pcrel_hi(d)
diff --git a/bolt/test/RISCV/reloc-pcrel.s b/bolt/test/RISCV/reloc-pcrel.s
index 36b132727291ec..3ad3015a0a57fa 100644
--- a/bolt/test/RISCV/reloc-pcrel.s
+++ b/bolt/test/RISCV/reloc-pcrel.s
@@ -14,12 +14,10 @@ d:
 // CHECK: Binary Function "_start" after building cfg {
 _start:
   nop // Here to not make the _start and .Ltmp0 symbols coincide
-// CHECK: .Ltmp0
-// CHECK: auipc t0, %pcrel_hi(d)
+// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp0
 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0)
   ld t0, d
-// CHECK: .Ltmp1
-// CHECK: auipc t1, %pcrel_hi(d)
+// CHECK-NEXT: auipc t1, %pcrel_hi(d) # Label: .Ltmp1
 // CHECK-NEXT: sd t0, %pcrel_lo(.Ltmp1)(t1)
   sd t0, d, t1
   ret
diff --git a/bolt/test/RISCV/reloc-tls.s b/bolt/test/RISCV/reloc-tls.s
new file mode 100644
index 00000000000000..6ced25b8e630a5
--- /dev/null
+++ b/bolt/test/RISCV/reloc-tls.s
@@ -0,0 +1,44 @@
+// RUN: llvm-mc -triple riscv64 -filetype obj -o %t.o %s
+// RUN: ld.lld --emit-relocs -o %t %t.o
+// RUN: llvm-bolt --print-cfg --print-only=tls_le,tls_ie -o /dev/null %t \
+// RUN:    | FileCheck %s
+
+// CHECK-LABEL: Binary Function "tls_le{{.*}}" after building cfg {
+// CHECK:      lui a5, 0
+// CHECK-NEXT: add a5, a5, tp
+// CHECK-NEXT: lw t0, 0(a5)
+// CHECK-NEXT: sw t0, 0(a5)
+
+// CHECK-LABEL: Binary Function "tls_ie" after building cfg {
+// CHECK-LABEL: .Ltmp0
+// CHECK:      auipc a0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}})
+// CHECK-NEXT: ld a0, %pcrel_lo(.Ltmp0)(a0)
+    .text
+    .globl tls_le, _start
+    .p2align 2
+tls_le:
+_start:
+    nop
+    lui a5, %tprel_hi(i)
+    add a5, a5, tp, %tprel_add(i)
+    lw t0, %tprel_lo(i)(a5)
+    sw t0, %tprel_lo(i)(a5)
+    ret
+    .size _start, .-_start
+
+    .globl tls_ie
+    .p2align 2
+tls_ie:
+    nop
+    la.tls.ie a0, i
+    ret
+    .size tls_ie, .-tls_ie
+
+    .section .tbss,"awT",@nobits
+    .type i,@object
+    .globl i
+    .p2align 3
+i:
+    .quad 0
+    .size i, .-i
+
diff --git a/bolt/test/RISCV/tls-le-gnu-ld.test b/bolt/test/RISCV/tls-le-gnu-ld.test
new file mode 100644
index 00000000000000..c3ff08b30ee60d
--- /dev/null
+++ b/bolt/test/RISCV/tls-le-gnu-ld.test
@@ -0,0 +1,11 @@
+// This test checks that the binaries produces with GNU ld TLS le relaxation are
+// properly processed by BOLT. GNU ld currently emits two non-standard
+// relocations (R_RISCV_TPREL_I and R_RISCV_TPREL_S) in this case.
+
+// RUN: yaml2obj %p/Inputs/tls-le-gnu-ld.yaml &> %t.exe
+// RUN: llvm-bolt %t.exe -o %t.bolt.exe --print-cfg --print-only=_start \
+// RUN:   | FileCheck %s
+
+// CHECK: Binary Function "_start" after building cfg {
+// CHECK:      lw t0, 0(tp)
+// CHECK-NEXT: sw t0, 0(tp)
diff --git a/bolt/test/checkvma-large-section.test b/bolt/test/checkvma-large-section.test
new file mode 100644
index 00000000000000..36a915951115e1
--- /dev/null
+++ b/bolt/test/checkvma-large-section.test
@@ -0,0 +1,35 @@
+# This test reproduces the issue with a section which ends at >4G address
+REQUIRES: asserts
+RUN: split-file %s %t
+RUN: yaml2obj %t/yaml -o %t.exe --max-size=0
+RUN: llvm-bolt %t.exe -o /dev/null --allow-stripped
+#--- yaml
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_EXEC
+  Machine: EM_X86_64
+ProgramHeaders:
+  - Type: PT_LOAD
+    FirstSec: .a
+    LastSec: .a
+    Align: 0x1000
+  - Type: PT_LOAD
+    Flags: [ PF_R, PF_W ]
+    FirstSec: .large_sec
+    LastSec: .large_sec
+    VAddr: 0x4a0279a8
+  - Type: PT_GNU_RELRO
+    Flags: [ PF_R ]
+Sections:
+  - Name: .a
+    Type: SHT_PROGBITS
+    Content: 00
+    AddressAlign: 0x1
+  - Name: .large_sec
+    Type: SHT_PROGBITS
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x4a0279a8
+    Size: 0xdf8bb1a0
+...
diff --git a/bolt/test/runtime/X86/gotoff-large-code-model-2.s b/bolt/test/runtime/X86/gotoff-large-code-model-2.s
new file mode 100644
index 00000000000000..b7f7bf0527455e
--- /dev/null
+++ b/bolt/test/runtime/X86/gotoff-large-code-model-2.s
@@ -0,0 +1,57 @@
+# A variation of gotoff-large-code-model.s that accesses GOT value
+# with a slightly different code sequence.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN:   %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
+
+# RUN: llvm-bolt %t.exe --funcs init_impls --lite \
+# RUN:   -o %t.bolted
+# RUN: %t.bolted | FileCheck %s
+
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.LC2:
+	.string	"Hello, world\n"
+	.text
+	.p2align 4
+	.globl	init_impls
+	.type	init_impls, @function
+init_impls:
+	.cfi_startproc
+  push   %rbp
+  mov    %rsp,%rbp
+  push   %r15
+  push   %rbx
+  sub    $0x8,%rsp
+  lea    1f(%rip),%rbx
+  #  R_X86_64_GOTPC64  _GLOBAL_OFFSET_TABLE_+0x2
+1: movabsq $_GLOBAL_OFFSET_TABLE_, %r11
+  add    %r11,%rbx
+  #  R_X86_64_GOTOFF64 .LC2
+  movabs $.LC2@gotoff,%rax
+  lea    (%rbx,%rax,1),%rax
+  mov    %rax,%rdi
+  mov    %rbx,%r15
+  #  R_X86_64_PLTOFF64 puts
+  movabs $puts@pltoff,%rax
+  add    %rbx,%rax
+  call   *%rax
+  add    $0x8,%rsp
+  pop    %rbx
+  pop    %r15
+  pop    %rbp
+  retq
+  .cfi_endproc
+  .size init_impls, .-init_impls
+
+  .globl main
+  .type main, @function
+  .p2align 4
+main:
+  callq init_impls
+  xorq  %rax, %rax
+  ret
+
+# CHECK: Hello, world
diff --git a/bolt/test/runtime/X86/gotoff-large-code-model.s b/bolt/test/runtime/X86/gotoff-large-code-model.s
new file mode 100644
index 00000000000000..6ec61d2380c202
--- /dev/null
+++ b/bolt/test/runtime/X86/gotoff-large-code-model.s
@@ -0,0 +1,55 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN:   %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
+
+# RUN: llvm-bolt %t.exe --funcs init_impls --lite \
+# RUN:   -o %t.bolted
+# RUN: %t.bolted | FileCheck %s
+
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.LC2:
+	.string	"Hello, world\n"
+	.text
+	.p2align 4
+	.globl	init_impls
+	.type	init_impls, @function
+init_impls:
+	.cfi_startproc
+  push   %rbp
+  mov    %rsp,%rbp
+  push   %r15
+  push   %rbx
+  sub    $0x8,%rsp
+1:
+  lea    1b(%rip),%rbx
+  #  R_X86_64_GOTPC64  _GLOBAL_OFFSET_TABLE_+0x9
+  movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r11
+  add    %r11,%rbx
+  #  R_X86_64_GOTOFF64 .LC2
+  movabs $.LC2@gotoff,%rax
+  lea    (%rbx,%rax,1),%rax
+  mov    %rax,%rdi
+  mov    %rbx,%r15
+  #  R_X86_64_PLTOFF64 puts
+  movabs $puts@pltoff,%rax
+  add    %rbx,%rax
+  call   *%rax
+  add    $0x8,%rsp
+  pop    %rbx
+  pop    %r15
+  pop    %rbp
+  retq
+  .cfi_endproc
+  .size init_impls, .-init_impls
+
+  .globl main
+  .type main, @function
+  .p2align 4
+main:
+  callq init_impls
+  xorq  %rax, %rax
+  ret
+
+# CHECK: Hello, world
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index 6519ff6934d1f8..b851c756e7960e 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -52,8 +52,9 @@ struct MCPlusBuilderTester : public testing::TestWithParam<Triple::ArchType> {
     BC = cantFail(BinaryContext::createBinaryContext(
         ObjFile.get(), true, DWARFContext::create(*ObjFile.get())));
     ASSERT_FALSE(!BC);
-    BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(createMCPlusBuilder(
-        GetParam(), BC->MIA.get(), BC->MII.get(), BC->MRI.get())));
+    BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(
+        createMCPlusBuilder(GetParam(), BC->MIA.get(), BC->MII.get(),
+                            BC->MRI.get(), BC->STI.get())));
   }
 
   void testRegAliases(Triple::ArchType Arch, uint64_t Register,
diff --git a/clang-tools-extra/clang-change-namespace/tool/ClangChangeNamespace.cpp b/clang-tools-extra/clang-change-namespace/tool/ClangChangeNamespace.cpp
index 5f30cbf8fb4773..22d26db0c11bcf 100644
--- a/clang-tools-extra/clang-change-namespace/tool/ClangChangeNamespace.cpp
+++ b/clang-tools-extra/clang-change-namespace/tool/ClangChangeNamespace.cpp
@@ -152,8 +152,8 @@ int main(int argc, const char **argv) {
       for (auto I = ChangedFiles.begin(), E = ChangedFiles.end(); I != E; ++I) {
         OS << "  {\n";
         OS << "    \"FilePath\": \"" << *I << "\",\n";
-        const auto Entry = FileMgr.getFile(*I);
-        auto ID = Sources.getOrCreateFileID(*Entry, SrcMgr::C_User);
+        auto Entry = llvm::cantFail(FileMgr.getFileRef(*I));
+        auto ID = Sources.getOrCreateFileID(Entry, SrcMgr::C_User);
         std::string Content;
         llvm::raw_string_ostream ContentStream(Content);
         Rewrite.getEditBuffer(ID).write(ContentStream);
@@ -170,9 +170,9 @@ int main(int argc, const char **argv) {
   }
 
   for (const auto &File : ChangedFiles) {
-    const auto Entry = FileMgr.getFile(File);
+    auto Entry = llvm::cantFail(FileMgr.getFileRef(File));
 
-    auto ID = Sources.getOrCreateFileID(*Entry, SrcMgr::C_User);
+    auto ID = Sources.getOrCreateFileID(Entry, SrcMgr::C_User);
     outs() << "============== " << File << " ==============\n";
     Rewrite.getEditBuffer(ID).write(llvm::outs());
     outs() << "\n============================================\n";
diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp
index 94ba73ce9ad5a1..404acf55e3aa53 100644
--- a/clang-tools-extra/clang-move/Move.cpp
+++ b/clang-tools-extra/clang-move/Move.cpp
@@ -843,7 +843,7 @@ void ClangMoveTool::moveDeclsToNewFiles() {
 // Move all contents from OldFile to NewFile.
 void ClangMoveTool::moveAll(SourceManager &SM, StringRef OldFile,
                             StringRef NewFile) {
-  auto FE = SM.getFileManager().getFile(makeAbsolutePath(OldFile));
+  auto FE = SM.getFileManager().getOptionalFileRef(makeAbsolutePath(OldFile));
   if (!FE) {
     llvm::errs() << "Failed to get file: " << OldFile << "\n";
     return;
diff --git a/clang-tools-extra/clang-reorder-fields/tool/ClangReorderFields.cpp b/clang-tools-extra/clang-reorder-fields/tool/ClangReorderFields.cpp
index 375306765a52b6..5b77ee7b5738c6 100644
--- a/clang-tools-extra/clang-reorder-fields/tool/ClangReorderFields.cpp
+++ b/clang-tools-extra/clang-reorder-fields/tool/ClangReorderFields.cpp
@@ -84,8 +84,8 @@ int main(int argc, const char **argv) {
   Tool.applyAllReplacements(Rewrite);
 
   for (const auto &File : Files) {
-    auto Entry = FileMgr.getFile(File);
-    const auto ID = Sources.getOrCreateFileID(*Entry, SrcMgr::C_User);
+    auto Entry = llvm::cantFail(FileMgr.getFileRef(File));
+    const auto ID = Sources.getOrCreateFileID(Entry, SrcMgr::C_User);
     Rewrite.getEditBuffer(ID).write(outs());
   }
 
diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp
index 695bfd6e2edf7e..4b1a67b6dd98a9 100644
--- a/clang-tools-extra/clang-tidy/ClangTidy.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp
@@ -243,7 +243,7 @@ class ErrorReporter {
     if (FilePath.empty())
       return {};
 
-    auto File = SourceMgr.getFileManager().getFile(FilePath);
+    auto File = SourceMgr.getFileManager().getOptionalFileRef(FilePath);
     if (!File)
       return {};
 
diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
index c1e4437b88949b..b19a84f5dc2157 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
@@ -194,9 +194,9 @@ DiagnosticBuilder ClangTidyContext::diag(
 
 DiagnosticBuilder ClangTidyContext::diag(const tooling::Diagnostic &Error) {
   SourceManager &SM = DiagEngine->getSourceManager();
-  llvm::ErrorOr<const FileEntry *> File =
-      SM.getFileManager().getFile(Error.Message.FilePath);
-  FileID ID = SM.getOrCreateFileID(*File, SrcMgr::C_User);
+  FileManager &FM = SM.getFileManager();
+  FileEntryRef File = llvm::cantFail(FM.getFileRef(Error.Message.FilePath));
+  FileID ID = SM.getOrCreateFileID(File, SrcMgr::C_User);
   SourceLocation FileStartLoc = SM.getLocForStartOfFile(ID);
   SourceLocation Loc = FileStartLoc.getLocWithOffset(
       static_cast<SourceLocation::IntTy>(Error.Message.FileOffset));
diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp
index fed19bdcc29143..e336ba1ee1fa72 100644
--- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp
@@ -30,6 +30,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
+#include "clang/Tooling/Inclusions/StandardLibrary.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -97,9 +98,12 @@ bool IncludeCleanerCheck::shouldIgnore(const include_cleaner::Header &H) {
   return llvm::any_of(IgnoreHeadersRegex, [&H](const llvm::Regex &R) {
     switch (H.kind()) {
     case include_cleaner::Header::Standard:
+      // We don't trim angle brackets around standard library headers
+      // deliberately, so that they are only matched as <vector>, otherwise
+      // having just `.*/vector` might yield false positives.
       return R.match(H.standard().name());
     case include_cleaner::Header::Verbatim:
-      return R.match(H.verbatim());
+      return R.match(H.verbatim().trim("<>\""));
     case include_cleaner::Header::Physical:
       return R.match(H.physical().getFileEntry().tryGetRealPathName());
     }
@@ -179,12 +183,14 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) {
       if (getCurrentMainFile().endswith(PHeader))
         continue;
     }
-
-    if (llvm::none_of(
-            IgnoreHeadersRegex,
-            [Resolved = (*I.Resolved).getFileEntry().tryGetRealPathName()](
-                const llvm::Regex &R) { return R.match(Resolved); }))
-      Unused.push_back(&I);
+    auto StdHeader = tooling::stdlib::Header::named(
+        I.quote(), PP->getLangOpts().CPlusPlus ? tooling::stdlib::Lang::CXX
+                                               : tooling::stdlib::Lang::C);
+    if (StdHeader && shouldIgnore(*StdHeader))
+      continue;
+    if (shouldIgnore(*I.Resolved))
+      continue;
+    Unused.push_back(&I);
   }
 
   llvm::StringRef Code = SM->getBufferData(SM->getMainFileID());
diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
index 22dc9e21cab9d5..e6293ed48bfddb 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
@@ -61,7 +61,8 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) {
     // before the typedef will be the nested one (PR#50990). Therefore, we also
     // keep track of the parent declaration, so that we can look up the last
     // TagDecl that is a sibling of the typedef in the AST.
-    LastTagDeclRanges[ParentDecl] = MatchedTagDecl->getSourceRange();
+    if (MatchedTagDecl->isThisDeclarationADefinition())
+      LastTagDeclRanges[ParentDecl] = MatchedTagDecl->getSourceRange();
     return;
   }
 
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 8fc28c09034180..0ecf786e73bc41 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -249,14 +249,12 @@ Changes in existing checks
 - Improved :doc:`misc-const-correctness
   <clang-tidy/checks/misc/const-correctness>` check to avoid false positive when
   using pointer to member function.
-  
-- Improved :doc:`misc-include-cleaner
-  <clang-tidy/checks/misc/include-cleaner>` check by adding option
-  `DeduplicateFindings` to output one finding per symbol occurrence.
 
 - Improved :doc:`misc-include-cleaner
-  <clang-tidy/checks/misc/include-cleaner>` check to avoid fixes insert
-  same include header multiple times.
+  <clang-tidy/checks/misc/include-cleaner>` check by adding option
+  `DeduplicateFindings` to output one finding per symbol occurrence, avoid
+  inserting the same header multiple times, fix a bug where `IgnoreHeaders`
+  option won't work with verbatim/std headers.
 
 - Improved :doc:`misc-redundant-expression
   <clang-tidy/checks/misc/redundant-expression>` check to ignore
@@ -279,8 +277,8 @@ Changes in existing checks
   fixes for reordering arguments.
 
 - Improved :doc:`modernize-use-using
-  <clang-tidy/checks/modernize/use-using>` check to fix function pointer
-  ``typedef`` correctly.
+  <clang-tidy/checks/modernize/use-using>` check to fix function pointer and
+  forward declared ``typedef`` correctly.
 
 - Improved :doc:`performance-faster-string-find
   <clang-tidy/checks/performance/faster-string-find>` check to properly escape
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
index f7db0af6434ac4..422abee11a7196 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
@@ -321,3 +321,7 @@ typedef bool (*ISSUE_65055_2)(int);
 // CHECK-MESSAGES: :[[@LINE-2]]:1: warning: use 'using' instead of 'typedef'
 // CHECK-FIXES: {{^}}using ISSUE_65055_1 = void (*)(int);{{$}}
 // CHECK-FIXES: {{^}}using ISSUE_65055_2 = bool (*)(int);{{$}}
+
+typedef class ISSUE_67529_1 *ISSUE_67529;
+// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef'
+// CHECK-FIXES: using ISSUE_67529 = class ISSUE_67529_1 *;
diff --git a/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp b/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp
index 0c29b469b617b7..8da1051a860a8c 100644
--- a/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp
+++ b/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp
@@ -59,18 +59,20 @@ TEST(IncludeCleanerCheckTest, SuppressUnusedIncludes) {
 #include "foo/qux.h"
 #include "baz/qux/qux.h"
 #include <vector>
+#include <list>
 )";
 
   const char *PostCode = R"(
 #include "bar.h"
 #include "foo/qux.h"
 #include <vector>
+#include <list>
 )";
 
   std::vector<ClangTidyError> Errors;
   ClangTidyOptions Opts;
   Opts.CheckOptions["IgnoreHeaders"] = llvm::StringRef{llvm::formatv(
-      "bar.h;{0};{1};vector",
+      "bar.h;{0};{1};vector;<list>;",
       llvm::Regex::escape(appendPathFileSystemIndependent({"foo", "qux.h"})),
       llvm::Regex::escape(appendPathFileSystemIndependent({"baz", "qux"})))};
   EXPECT_EQ(
@@ -79,6 +81,7 @@ TEST(IncludeCleanerCheckTest, SuppressUnusedIncludes) {
           PreCode, &Errors, "file.cpp", std::nullopt, Opts,
           {{"bar.h", "#pragma once"},
            {"vector", "#pragma once"},
+           {"list", "#pragma once"},
            {appendPathFileSystemIndependent({"foo", "qux.h"}), "#pragma once"},
            {appendPathFileSystemIndependent({"baz", "qux", "qux.h"}),
             "#pragma once"}}));
@@ -163,11 +166,13 @@ TEST(IncludeCleanerCheckTest, SuppressMissingIncludes) {
 int BarResult = bar();
 int BazResult = baz();
 int QuxResult = qux();
+int PrivResult = test();
+std::vector x;
 )";
 
   ClangTidyOptions Opts;
   Opts.CheckOptions["IgnoreHeaders"] = llvm::StringRef{
-      "baz.h;" +
+      "public.h;<vector>;baz.h;" +
       llvm::Regex::escape(appendPathFileSystemIndependent({"foo", "qux.h"}))};
   std::vector<ClangTidyError> Errors;
   EXPECT_EQ(PreCode, runCheckOnCode<IncludeCleanerCheck>(
@@ -175,18 +180,23 @@ int QuxResult = qux();
                          {{"bar.h", R"(#pragma once
                               #include "baz.h"
                               #include "foo/qux.h"
+                              #include "private.h"
                               int bar();
+                              namespace std { struct vector {}; }
                            )"},
                           {"baz.h", R"(#pragma once
                               int baz();
                            )"},
+                          {"private.h", R"(#pragma once
+                              // IWYU pragma: private, include "public.h"
+                              int test();
+                           )"},
                           {appendPathFileSystemIndependent({"foo", "qux.h"}),
                            R"(#pragma once
                               int qux();
                            )"}}));
 }
 
-
 TEST(IncludeCleanerCheckTest, MultipleTimeMissingInclude) {
   const char *PreCode = R"(
 #include "bar.h"
diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index fd341b182fd656..2a0953af53fadd 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -151,6 +151,10 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS
 
 find_package(Python3 COMPONENTS Interpreter)
 
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_TEST_PARAMS_default "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS}")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_TEST_PARAMS_default "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS}")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_TEST_PARAMS_default "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS}")
+
 # Remote test configuration.
 if(DEFINED REMOTE_TEST_HOST)
   # Allow override with the custom values.
@@ -162,11 +166,15 @@ if(DEFINED REMOTE_TEST_HOST)
         "\\\"${Python3_EXECUTABLE}\\\" \\\"${LLVM_PROJECT_DIR}/llvm/utils/remote-exec.py\\\" --host=${REMOTE_TEST_USER}@${REMOTE_TEST_HOST}"
         CACHE STRING "")
 
-  set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_TEST_PARAMS "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS} 'executor=${DEFAULT_TEST_EXECUTOR}'" CACHE STRING "")
-  set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_TEST_PARAMS "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS} 'executor=${DEFAULT_TEST_EXECUTOR}'" CACHE STRING "")
-  set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_TEST_PARAMS    "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS} 'executor=${DEFAULT_TEST_EXECUTOR}'" CACHE STRING "")
+  list(APPEND RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_TEST_PARAMS_default "executor=${DEFAULT_TEST_EXECUTOR}")
+  list(APPEND RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_TEST_PARAMS_default "executor=${DEFAULT_TEST_EXECUTOR}")
+  list(APPEND RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_TEST_PARAMS_default    "executor=${DEFAULT_TEST_EXECUTOR}")
 endif()
 
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_TEST_PARAMS "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_TEST_PARAMS_default}" CACHE INTERNAL "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_TEST_PARAMS "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_TEST_PARAMS_default}" CACHE INTERNAL "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_TEST_PARAMS "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_TEST_PARAMS_default}" CACHE INTERNAL "")
+
 set(LLVM_INSTALL_TOOLCHAIN_ONLY ON CACHE BOOL "")
 set(LLVM_TOOLCHAIN_TOOLS
   llvm-ar
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 5ebb8c82b30495..c5c72c68ee9b80 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -5730,7 +5730,7 @@ Examples
 ========
 
 A style similar to the `Linux Kernel style
-<https://www.kernel.org/doc/Documentation/CodingStyle>`_:
+<https://www.kernel.org/doc/html/latest/process/coding-style.html>`_:
 
 .. code-block:: yaml
 
diff --git a/clang/docs/ClangOffloadBundler.rst b/clang/docs/ClangOffloadBundler.rst
index d08bf4b97781fa..1e21d3e7264d5c 100644
--- a/clang/docs/ClangOffloadBundler.rst
+++ b/clang/docs/ClangOffloadBundler.rst
@@ -309,3 +309,30 @@ target by comparing bundle ID's. Two bundle ID's are considered compatible if:
   * Their offload kind are the same
   * Their target triple are the same
   * Their GPUArch are the same
+
+Compression and Decompression
+=============================
+
+``clang-offload-bundler`` provides features to compress and decompress the full
+bundle, leveraging inherent redundancies within the bundle entries. Use the
+`-compress` command-line option to enable this compression capability.
+
+The compressed offload bundle begins with a header followed by the compressed binary data:
+
+- **Magic Number (4 bytes)**:
+    This is a unique identifier to distinguish compressed offload bundles. The value is the string 'CCOB' (Compressed Clang Offload Bundle).
+
+- **Version Number (16-bit unsigned int)**:
+    This denotes the version of the compressed offload bundle format. The current version is `1`.
+
+- **Compression Method (16-bit unsigned int)**:
+    This field indicates the compression method used. The value corresponds to either `zlib` or `zstd`, represented as a 16-bit unsigned integer cast from the LLVM compression enumeration.
+
+- **Uncompressed Binary Size (32-bit unsigned int)**:
+    This is the size (in bytes) of the binary data before it was compressed.
+
+- **Hash (64-bit unsigned int)**:
+    This is a 64-bit truncated MD5 hash of the uncompressed binary data. It serves for verification and caching purposes.
+
+- **Compressed Data**:
+    The actual compressed binary data follows the header. Its size can be inferred from the total size of the file minus the header size.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 8f5a67e14c9aba..ae67a783c2fa2d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -91,6 +91,11 @@ C++20 Feature Support
 
 C++23 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
+- Implemented `P0847R7: Deducing this <https://wg21.link/P0847R7>`_. Some related core issues were also
+  implemented (`CWG2553 <https://wg21.link/CWG2553>`_, `CWG2554 <https://wg21.link/CWG2554>`_,
+  `CWG2653 <https://wg21.link/CWG2653>`_, `CWG2687 <https://wg21.link/CWG2687>`_). Because the
+  support for this feature is still experimental, the feature test macro ``__cpp_explicit_this_parameter``
+  was not set in this version.
 
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -127,6 +132,11 @@ C Language Changes
 - ``structs``, ``unions``, and ``arrays`` that are const may now be used as
   constant expressions.  This change is more consistent with the behavior of
   GCC.
+- Clang now supports the C-only attribute ``counted_by``. When applied to a
+  struct's flexible array member, it points to the struct field that holds the
+  number of elements in the flexible array member. This information can improve
+  the results of the array bound sanitizer and the
+  ``__builtin_dynamic_object_size`` builtin.
 
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^
@@ -135,12 +145,20 @@ C23 Feature Support
   previous placeholder value. Clang continues to accept ``-std=c2x`` and
   ``-std=gnu2x`` as aliases for C23 and GNU C23, respectively.
 - Clang now supports `requires c23` for module maps.
+- Clang now supports ``N3007 Type inference for object definitions``.
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
 
 New Compiler Flags
 ------------------
+* ``-fverify-intermediate-code`` and its complement ``-fno-verify-intermediate-code``.
+  Enables or disables verification of the generated LLVM IR.
+  Users can pass this to turn on extra verification to catch certain types of
+  compiler bugs at the cost of extra compile time.
+  Since enabling the verifier adds a non-trivial cost of a few percent impact on
+  build times, it's disabled by default, unless your LLVM distribution itself is
+  compiled with runtime checks enabled.
 
 Deprecated Compiler Flags
 -------------------------
@@ -208,6 +226,50 @@ Improvements to Clang's diagnostics
   (`#54678: <https://github.com/llvm/llvm-project/issues/54678>`_).
 - Clang now prints its 'note' diagnostic in cyan instead of black, to be more compatible
   with terminals with dark background colors. This is also more consistent with GCC.
+- The fix-it emitted by ``-Wformat`` for scoped enumerations now take the
+  enumeration's underlying type into account instead of suggesting a type just
+  based on the format string specifier being used.
+- Clang now displays an improved diagnostic and a note when a defaulted special
+  member is marked ``constexpr`` in a class with a virtual base class
+  (`#64843: <https://github.com/llvm/llvm-project/issues/64843>`_).
+- ``-Wfixed-enum-extension`` and ``-Wmicrosoft-fixed-enum`` diagnostics are no longer
+  emitted when building as C23, since C23 standardizes support for enums with a
+  fixed underlying type.
+- When describing the failure of static assertion of `==` expression, clang prints the integer
+  representation of the value as well as its character representation when
+  the user-provided expression is of character type. If the character is
+  non-printable, clang now shows the escpaed character.
+  Clang also prints multi-byte characters if the user-provided expression
+  is of multi-byte character type.
+
+  *Example Code*:
+
+  .. code-block:: c++
+
+     static_assert("A\n"[1] == U'🌍');
+
+  *BEFORE*:
+
+  .. code-block:: text
+
+    source:1:15: error: static assertion failed due to requirement '"A\n"[1] == U'\U0001f30d''
+    1 | static_assert("A\n"[1] == U'🌍');
+      |               ^~~~~~~~~~~~~~~~~
+    source:1:24: note: expression evaluates to ''
+    ' == 127757'
+    1 | static_assert("A\n"[1] == U'🌍');
+      |               ~~~~~~~~~^~~~~~~~
+
+  *AFTER*:
+
+  .. code-block:: text
+
+    source:1:15: error: static assertion failed due to requirement '"A\n"[1] == U'\U0001f30d''
+    1 | static_assert("A\n"[1] == U'🌍');
+      |               ^~~~~~~~~~~~~~~~~
+    source:1:24: note: expression evaluates to ''\n' (0x0A, 10) == U'🌍' (0x1F30D, 127757)'
+    1 | static_assert("A\n"[1] == U'🌍');
+      |               ~~~~~~~~~^~~~~~~~
 
 Bug Fixes in This Version
 -------------------------
@@ -274,6 +336,15 @@ Bug Fixes in This Version
   Fixes (`#67603 <https://github.com/llvm/llvm-project/issues/67603>`_)
 - Fixes a crash caused by a multidimensional array being captured by a lambda
   (`#67722 <https://github.com/llvm/llvm-project/issues/67722>`_).
+- Fixes a crash when instantiating a lambda with requires clause.
+  (`#64462 <https://github.com/llvm/llvm-project/issues/64462>`_)
+- Fixes a regression where the ``UserDefinedLiteral`` was not properly preserved
+  while evaluating consteval functions. (`#63898 <https://github.com/llvm/llvm-project/issues/63898>`_).
+- Fix a crash when evaluating value-dependent structured binding
+  variables at compile time.
+  Fixes (`#67690 <https://github.com/llvm/llvm-project/issues/67690>`_)
+- Fixes a ``clang-17`` regression where ``LLVM_UNREACHABLE_OPTIMIZE=OFF``
+  cannot be used with ``Release`` mode builds. (`#68237 <https://github.com/llvm/llvm-project/issues/68237>`_).
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -366,6 +437,9 @@ Bug Fixes to C++ Support
 - Fix crash caused by a spaceship operator returning a comparision category by
   reference. Fixes:
   (`#64162 <https://github.com/llvm/llvm-project/issues/64162>`_)
+- Fix a crash when calling a consteval function in an expression used as
+  the size of an array.
+  (`#65520 <https://github.com/llvm/llvm-project/issues/65520>`_)
 
 - Clang no longer tries to capture non-odr-used variables that appear
   in the enclosing expression of a lambda expression with a noexcept specifier.
@@ -378,6 +452,15 @@ Bug Fixes to C++ Support
 - Fixed a bug causing destructors of constant-evaluated structured bindings
   initialized by array elements to be called in the wrong evaluation context.
 
+- Fix crash where ill-formed code was being treated as a deduction guide and
+  we now produce a diagnostic. Fixes:
+  (`#65522 <https://github.com/llvm/llvm-project/issues/65522>`_)
+
+- Fixed a bug where clang incorrectly considered implicitly generated deduction
+  guides from a non-templated constructor and a templated constructor as ambiguous,
+  rather than prefer the non-templated constructor as specified in
+  [standard.group]p3.
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed an import failure of recursive friend class template.
@@ -398,6 +481,8 @@ Miscellaneous Clang Crashes Fixed
   `Issue 64065 <https://github.com/llvm/llvm-project/issues/64065>`_
 - Fixed a crash when check array access on zero-length element.
   `Issue 64564 <https://github.com/llvm/llvm-project/issues/64564>`_
+- Fixed a crash when an ObjC ivar has an invalid type. See
+  (`#68001 <https://github.com/llvm/llvm-project/pull/68001>`_)
 
 Target Specific Changes
 -----------------------
@@ -533,6 +618,14 @@ Static Analyzer
   Read the PR for the details.
   (`#66086 <https://github.com/llvm/llvm-project/pull/66086>`_)
 
+- A few crashes have been found and fixed using randomized testing related
+  to the use of ``_BitInt()`` in tidy checks and in clang analysis. See
+  `#67212 <https://github.com/llvm/llvm-project/pull/67212>`_,
+  `#66782 <https://github.com/llvm/llvm-project/pull/66782>`_,
+  `#65889 <https://github.com/llvm/llvm-project/pull/65889>`_,
+  `#65888 <https://github.com/llvm/llvm-project/pull/65888>`_, and
+  `#65887 <https://github.com/llvm/llvm-project/pull/65887>`_
+
 .. _release-notes-sanitizers:
 
 Sanitizers
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 4ee32c76a95d8e..8ad0514ee2ce22 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -195,20 +195,25 @@ class ASTContext : public RefCountedBase<ASTContext> {
       ConstantArrayTypes;
   mutable llvm::FoldingSet<IncompleteArrayType> IncompleteArrayTypes;
   mutable std::vector<VariableArrayType*> VariableArrayTypes;
-  mutable llvm::FoldingSet<DependentSizedArrayType> DependentSizedArrayTypes;
-  mutable llvm::FoldingSet<DependentSizedExtVectorType>
-    DependentSizedExtVectorTypes;
-  mutable llvm::FoldingSet<DependentAddressSpaceType>
+  mutable llvm::ContextualFoldingSet<DependentSizedArrayType, ASTContext &>
+      DependentSizedArrayTypes;
+  mutable llvm::ContextualFoldingSet<DependentSizedExtVectorType, ASTContext &>
+      DependentSizedExtVectorTypes;
+  mutable llvm::ContextualFoldingSet<DependentAddressSpaceType, ASTContext &>
       DependentAddressSpaceTypes;
   mutable llvm::FoldingSet<VectorType> VectorTypes;
-  mutable llvm::FoldingSet<DependentVectorType> DependentVectorTypes;
+  mutable llvm::ContextualFoldingSet<DependentVectorType, ASTContext &>
+      DependentVectorTypes;
   mutable llvm::FoldingSet<ConstantMatrixType> MatrixTypes;
-  mutable llvm::FoldingSet<DependentSizedMatrixType> DependentSizedMatrixTypes;
+  mutable llvm::ContextualFoldingSet<DependentSizedMatrixType, ASTContext &>
+      DependentSizedMatrixTypes;
   mutable llvm::FoldingSet<FunctionNoProtoType> FunctionNoProtoTypes;
   mutable llvm::ContextualFoldingSet<FunctionProtoType, ASTContext&>
     FunctionProtoTypes;
-  mutable llvm::FoldingSet<DependentTypeOfExprType> DependentTypeOfExprTypes;
-  mutable llvm::FoldingSet<DependentDecltypeType> DependentDecltypeTypes;
+  mutable llvm::ContextualFoldingSet<DependentTypeOfExprType, ASTContext &>
+      DependentTypeOfExprTypes;
+  mutable llvm::ContextualFoldingSet<DependentDecltypeType, ASTContext &>
+      DependentDecltypeTypes;
   mutable llvm::FoldingSet<TemplateTypeParmType> TemplateTypeParmTypes;
   mutable llvm::FoldingSet<ObjCTypeParamType> ObjCTypeParamTypes;
   mutable llvm::FoldingSet<SubstTemplateTypeParmType>
@@ -238,7 +243,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
   mutable llvm::FoldingSet<AttributedType> AttributedTypes;
   mutable llvm::FoldingSet<PipeType> PipeTypes;
   mutable llvm::FoldingSet<BitIntType> BitIntTypes;
-  mutable llvm::FoldingSet<DependentBitIntType> DependentBitIntTypes;
+  mutable llvm::ContextualFoldingSet<DependentBitIntType, ASTContext &>
+      DependentBitIntTypes;
   llvm::FoldingSet<BTFTagAttributedType> BTFTagAttributedTypes;
 
   mutable llvm::FoldingSet<QualifiedTemplateName> QualifiedTemplateNames;
diff --git a/clang/include/clang/AST/ASTLambda.h b/clang/include/clang/AST/ASTLambda.h
index 230e0c848610fe..646cb574847fe2 100644
--- a/clang/include/clang/AST/ASTLambda.h
+++ b/clang/include/clang/AST/ASTLambda.h
@@ -35,6 +35,21 @@ inline bool isLambdaCallOperator(const DeclContext *DC) {
   return isLambdaCallOperator(cast<CXXMethodDecl>(DC));
 }
 
+inline bool isLambdaCallWithExplicitObjectParameter(const DeclContext *DC) {
+  return isLambdaCallOperator(DC) &&
+         cast<CXXMethodDecl>(DC)->isExplicitObjectMemberFunction();
+}
+
+inline bool isLambdaCallWithImplicitObjectParameter(const DeclContext *DC) {
+  return isLambdaCallOperator(DC) &&
+         // FIXME: Checking for a null type is not great
+         // but lambdas with invalid captures or whose closure parameter list
+         // have not fully been parsed may have a call operator whose type is
+         // null.
+         !cast<CXXMethodDecl>(DC)->getType().isNull() &&
+         !cast<CXXMethodDecl>(DC)->isExplicitObjectMemberFunction();
+}
+
 inline bool isGenericLambdaCallOperatorSpecialization(const CXXMethodDecl *MD) {
   if (!MD) return false;
   const CXXRecordDecl *LambdaClass = MD->getParent();
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index c51b33c9e594a8..2830861add39e7 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -1809,6 +1809,18 @@ class ParmVarDecl : public VarDecl {
     ParmVarDeclBits.IsKNRPromoted = promoted;
   }
 
+  bool isExplicitObjectParameter() const {
+    return ExplicitObjectParameterIntroducerLoc.isValid();
+  }
+
+  void setExplicitObjectParameterLoc(SourceLocation Loc) {
+    ExplicitObjectParameterIntroducerLoc = Loc;
+  }
+
+  SourceLocation getExplicitObjectParamThisLoc() const {
+    return ExplicitObjectParameterIntroducerLoc;
+  }
+
   Expr *getDefaultArg();
   const Expr *getDefaultArg() const {
     return const_cast<ParmVarDecl *>(this)->getDefaultArg();
@@ -1875,7 +1887,10 @@ class ParmVarDecl : public VarDecl {
   static bool classofKind(Kind K) { return K == ParmVar; }
 
 private:
+  friend class ASTDeclReader;
+
   enum { ParameterIndexSentinel = (1 << NumParameterIndexBits) - 1 };
+  SourceLocation ExplicitObjectParameterIntroducerLoc;
 
   void setParameterIndex(unsigned parameterIndex) {
     if (parameterIndex >= ParameterIndexSentinel) {
@@ -2640,6 +2655,23 @@ class FunctionDecl : public DeclaratorDecl,
   /// parameters have default arguments (in C++).
   unsigned getMinRequiredArguments() const;
 
+  /// Returns the minimum number of non-object arguments needed to call this
+  /// function. This produces the same value as getMinRequiredArguments except
+  /// it does not count the explicit object argument, if any.
+  unsigned getMinRequiredExplicitArguments() const;
+
+  bool hasCXXExplicitFunctionObjectParameter() const;
+
+  unsigned getNumNonObjectParams() const;
+
+  const ParmVarDecl *getNonObjectParameter(unsigned I) const {
+    return getParamDecl(hasCXXExplicitFunctionObjectParameter() ? I + 1 : I);
+  }
+
+  ParmVarDecl *getNonObjectParameter(unsigned I) {
+    return getParamDecl(hasCXXExplicitFunctionObjectParameter() ? I + 1 : I);
+  }
+
   /// Determine whether this function has a single parameter, or multiple
   /// parameters where all but the first have default arguments.
   ///
@@ -4272,6 +4304,30 @@ class RecordDecl : public TagDecl {
     return field_begin() == field_end();
   }
 
+  FieldDecl *getLastField() {
+    FieldDecl *FD = nullptr;
+    for (FieldDecl *Field : fields())
+      FD = Field;
+    return FD;
+  }
+  const FieldDecl *getLastField() const {
+    return const_cast<RecordDecl *>(this)->getLastField();
+  }
+
+  template <typename Functor>
+  const FieldDecl *findFieldIf(Functor &Pred) const {
+    for (const Decl *D : decls()) {
+      if (const auto *FD = dyn_cast<FieldDecl>(D); FD && Pred(FD))
+        return FD;
+
+      if (const auto *RD = dyn_cast<RecordDecl>(D))
+        if (const FieldDecl *FD = RD->findFieldIf(Pred))
+          return FD;
+    }
+
+    return nullptr;
+  }
+
   /// Note that the definition of this type is now complete.
   virtual void completeDefinition();
 
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 12137387b676a7..d383e46e22e16f 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -18,6 +18,7 @@
 #include "clang/AST/DeclarationName.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/Specifiers.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -477,6 +478,15 @@ class alignas(8) Decl {
   // Return true if this is a FileContext Decl.
   bool isFileContextDecl() const;
 
+  /// Whether it resembles a flexible array member. This is a static member
+  /// because we want to be able to call it with a nullptr. That allows us to
+  /// perform non-Decl specific checks based on the object's type and strict
+  /// flex array level.
+  static bool isFlexibleArrayMemberLike(
+      ASTContext &Context, const Decl *D, QualType Ty,
+      LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel,
+      bool IgnoreTemplateOrMacroSubstitution);
+
   ASTContext &getASTContext() const LLVM_READONLY;
 
   /// Helper to get the language options from the ASTContext.
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index afec8150c2c9c6..aa3e3322faa42e 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -2061,6 +2061,17 @@ class CXXMethodDecl : public FunctionDecl {
   bool isStatic() const;
   bool isInstance() const { return !isStatic(); }
 
+  /// [C++2b][dcl.fct]/p7
+  /// An explicit object member function is a non-static
+  /// member function with an explicit object parameter. e.g.,
+  ///   void func(this SomeType);
+  bool isExplicitObjectMemberFunction() const;
+
+  /// [C++2b][dcl.fct]/p7
+  /// An implicit object member function is a non-static
+  /// member function without an explicit object parameter.
+  bool isImplicitObjectMemberFunction() const;
+
   /// Returns true if the given operator is implicitly static in a record
   /// context.
   static bool isStaticOverloadedOperator(OverloadedOperatorKind OOK) {
@@ -2169,14 +2180,19 @@ class CXXMethodDecl : public FunctionDecl {
   /// Return the type of the object pointed by \c this.
   ///
   /// See getThisType() for usage restriction.
-  QualType getThisObjectType() const;
+
+  QualType getFunctionObjectParameterReferenceType() const;
+  QualType getFunctionObjectParameterType() const {
+    return getFunctionObjectParameterReferenceType().getNonReferenceType();
+  }
+
+  unsigned getNumExplicitParams() const {
+    return getNumParams() - (isExplicitObjectMemberFunction() ? 1 : 0);
+  }
 
   static QualType getThisType(const FunctionProtoType *FPT,
                               const CXXRecordDecl *Decl);
 
-  static QualType getThisObjectType(const FunctionProtoType *FPT,
-                                    const CXXRecordDecl *Decl);
-
   Qualifiers getMethodQualifiers() const {
     return getType()->castAs<FunctionProtoType>()->getMethodQuals();
   }
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index fcda738ef4c552..b69c616b009036 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -1449,6 +1449,16 @@ class DeclRefExpr final
     DeclRefExprBits.IsImmediateEscalating = Set;
   }
 
+  bool isCapturedByCopyInLambdaWithExplicitObjectParameter() const {
+    return DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter;
+  }
+
+  void setCapturedByCopyInLambdaWithExplicitObjectParameter(
+      bool Set, const ASTContext &Context) {
+    DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = Set;
+    setDependence(computeDependence(this, Context));
+  }
+
   static bool classof(const Stmt *T) {
     return T->getStmtClass() == DeclRefExprClass;
   }
diff --git a/clang/include/clang/AST/Mangle.h b/clang/include/clang/AST/Mangle.h
index c04bcc7f01cb4e..e586b0cec43dfd 100644
--- a/clang/include/clang/AST/Mangle.h
+++ b/clang/include/clang/AST/Mangle.h
@@ -178,8 +178,8 @@ class MangleContext {
   /// or type uniquing.
   /// TODO: Extend this to internal types by generating names that are unique
   /// across translation units so it can be used with LTO.
-  virtual void mangleTypeName(QualType T, raw_ostream &,
-                              bool NormalizeIntegers = false) = 0;
+  virtual void mangleCanonicalTypeName(QualType T, raw_ostream &,
+                                       bool NormalizeIntegers = false) = 0;
 
   /// @}
 };
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 31ae3d42e232fc..eeeca1998f9fa9 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -9220,6 +9220,27 @@ class OMPXAttributeClause
   }
 };
 
+/// This represents 'ompx_bare' clause in the '#pragma omp target teams ...'
+/// directive.
+///
+/// \code
+/// #pragma omp target teams ompx_bare
+/// \endcode
+/// In this example directive '#pragma omp target teams' has a 'ompx_bare'
+/// clause.
+class OMPXBareClause : public OMPNoChildClause<llvm::omp::OMPC_ompx_bare> {
+public:
+  /// Build 'ompx_bare' clause.
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param EndLoc Ending location of the clause.
+  OMPXBareClause(SourceLocation StartLoc, SourceLocation EndLoc)
+      : OMPNoChildClause(StartLoc, EndLoc) {}
+
+  /// Build an empty clause.
+  OMPXBareClause() = default;
+};
+
 } // namespace clang
 
 #endif // LLVM_CLANG_AST_OPENMPCLAUSE_H
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index d4146d52893ffb..298489e7d4fc41 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3890,6 +3890,11 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXAttributeClause(
   return true;
 }
 
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPXBareClause(OMPXBareClause *C) {
+  return true;
+}
+
 // FIXME: look at the following tricky-seeming exprs to see if we
 // need to recurse on anything.  These are ones that have methods
 // returning decls or qualtypes or nestednamespecifier -- though I'm
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index 87ffebc00d7b79..69a764480ed568 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -383,6 +383,7 @@ class alignas(void *) Stmt {
     unsigned HasFoundDecl : 1;
     unsigned HadMultipleCandidates : 1;
     unsigned RefersToEnclosingVariableOrCapture : 1;
+    unsigned CapturedByCopyInLambdaWithExplicitObjectParameter : 1;
     unsigned NonOdrUseReason : 2;
     unsigned IsImmediateEscalating : 1;
 
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 4799f89db82fa7..a78d8f60462b23 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -3289,8 +3289,6 @@ class VariableArrayType : public ArrayType {
 class DependentSizedArrayType : public ArrayType {
   friend class ASTContext; // ASTContext creates these.
 
-  const ASTContext &Context;
-
   /// An assignment expression that will instantiate to the
   /// size of the array.
   ///
@@ -3301,8 +3299,8 @@ class DependentSizedArrayType : public ArrayType {
   /// The range spanned by the left and right array brackets.
   SourceRange Brackets;
 
-  DependentSizedArrayType(const ASTContext &Context, QualType et, QualType can,
-                          Expr *e, ArraySizeModifier sm, unsigned tq,
+  DependentSizedArrayType(QualType et, QualType can, Expr *e,
+                          ArraySizeModifier sm, unsigned tq,
                           SourceRange brackets);
 
 public:
@@ -3325,7 +3323,7 @@ class DependentSizedArrayType : public ArrayType {
     return T->getTypeClass() == DependentSizedArray;
   }
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getElementType(),
             getSizeModifier(), getIndexTypeCVRQualifiers(), getSizeExpr());
   }
@@ -3349,14 +3347,12 @@ class DependentSizedArrayType : public ArrayType {
 class DependentAddressSpaceType : public Type, public llvm::FoldingSetNode {
   friend class ASTContext;
 
-  const ASTContext &Context;
   Expr *AddrSpaceExpr;
   QualType PointeeType;
   SourceLocation loc;
 
-  DependentAddressSpaceType(const ASTContext &Context, QualType PointeeType,
-                            QualType can, Expr *AddrSpaceExpr,
-                            SourceLocation loc);
+  DependentAddressSpaceType(QualType PointeeType, QualType can,
+                            Expr *AddrSpaceExpr, SourceLocation loc);
 
 public:
   Expr *getAddrSpaceExpr() const { return AddrSpaceExpr; }
@@ -3370,7 +3366,7 @@ class DependentAddressSpaceType : public Type, public llvm::FoldingSetNode {
     return T->getTypeClass() == DependentAddressSpace;
   }
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getPointeeType(), getAddrSpaceExpr());
   }
 
@@ -3391,7 +3387,6 @@ class DependentAddressSpaceType : public Type, public llvm::FoldingSetNode {
 class DependentSizedExtVectorType : public Type, public llvm::FoldingSetNode {
   friend class ASTContext;
 
-  const ASTContext &Context;
   Expr *SizeExpr;
 
   /// The element type of the array.
@@ -3399,8 +3394,8 @@ class DependentSizedExtVectorType : public Type, public llvm::FoldingSetNode {
 
   SourceLocation loc;
 
-  DependentSizedExtVectorType(const ASTContext &Context, QualType ElementType,
-                              QualType can, Expr *SizeExpr, SourceLocation loc);
+  DependentSizedExtVectorType(QualType ElementType, QualType can,
+                              Expr *SizeExpr, SourceLocation loc);
 
 public:
   Expr *getSizeExpr() const { return SizeExpr; }
@@ -3414,7 +3409,7 @@ class DependentSizedExtVectorType : public Type, public llvm::FoldingSetNode {
     return T->getTypeClass() == DependentSizedExtVector;
   }
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getElementType(), getSizeExpr());
   }
 
@@ -3513,14 +3508,12 @@ class VectorType : public Type, public llvm::FoldingSetNode {
 class DependentVectorType : public Type, public llvm::FoldingSetNode {
   friend class ASTContext;
 
-  const ASTContext &Context;
   QualType ElementType;
   Expr *SizeExpr;
   SourceLocation Loc;
 
-  DependentVectorType(const ASTContext &Context, QualType ElementType,
-                           QualType CanonType, Expr *SizeExpr,
-                           SourceLocation Loc, VectorType::VectorKind vecKind);
+  DependentVectorType(QualType ElementType, QualType CanonType, Expr *SizeExpr,
+                      SourceLocation Loc, VectorType::VectorKind vecKind);
 
 public:
   Expr *getSizeExpr() const { return SizeExpr; }
@@ -3537,7 +3530,7 @@ class DependentVectorType : public Type, public llvm::FoldingSetNode {
     return T->getTypeClass() == DependentVector;
   }
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getElementType(), getSizeExpr(), getVectorKind());
   }
 
@@ -3719,15 +3712,13 @@ class ConstantMatrixType final : public MatrixType {
 class DependentSizedMatrixType final : public MatrixType {
   friend class ASTContext;
 
-  const ASTContext &Context;
   Expr *RowExpr;
   Expr *ColumnExpr;
 
   SourceLocation loc;
 
-  DependentSizedMatrixType(const ASTContext &Context, QualType ElementType,
-                           QualType CanonicalType, Expr *RowExpr,
-                           Expr *ColumnExpr, SourceLocation loc);
+  DependentSizedMatrixType(QualType ElementType, QualType CanonicalType,
+                           Expr *RowExpr, Expr *ColumnExpr, SourceLocation loc);
 
 public:
   Expr *getRowExpr() const { return RowExpr; }
@@ -3738,7 +3729,7 @@ class DependentSizedMatrixType final : public MatrixType {
     return T->getTypeClass() == DependentSizedMatrix;
   }
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getElementType(), getRowExpr(), getColumnExpr());
   }
 
@@ -4749,15 +4740,12 @@ class TypeOfExprType : public Type {
 /// This class is used internally by the ASTContext to manage
 /// canonical, dependent types, only. Clients will only see instances
 /// of this class via TypeOfExprType nodes.
-class DependentTypeOfExprType
-  : public TypeOfExprType, public llvm::FoldingSetNode {
-  const ASTContext &Context;
-
+class DependentTypeOfExprType : public TypeOfExprType,
+                                public llvm::FoldingSetNode {
 public:
-  DependentTypeOfExprType(const ASTContext &Context, Expr *E, TypeOfKind Kind)
-      : TypeOfExprType(E, Kind), Context(Context) {}
+  DependentTypeOfExprType(Expr *E, TypeOfKind Kind) : TypeOfExprType(E, Kind) {}
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getUnderlyingExpr(),
             getKind() == TypeOfKind::Unqualified);
   }
@@ -4833,12 +4821,10 @@ class DecltypeType : public Type {
 /// canonical, dependent types, only. Clients will only see instances
 /// of this class via DecltypeType nodes.
 class DependentDecltypeType : public DecltypeType, public llvm::FoldingSetNode {
-  const ASTContext &Context;
-
 public:
-  DependentDecltypeType(const ASTContext &Context, Expr *E);
+  DependentDecltypeType(Expr *E, QualType UnderlyingTpe);
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, getUnderlyingExpr());
   }
 
@@ -6657,12 +6643,10 @@ class BitIntType final : public Type, public llvm::FoldingSetNode {
 
 class DependentBitIntType final : public Type, public llvm::FoldingSetNode {
   friend class ASTContext;
-  const ASTContext &Context;
   llvm::PointerIntPair<Expr*, 1, bool> ExprAndUnsigned;
 
 protected:
-  DependentBitIntType(const ASTContext &Context, bool IsUnsigned,
-                      Expr *NumBits);
+  DependentBitIntType(bool IsUnsigned, Expr *NumBits);
 
 public:
   bool isUnsigned() const;
@@ -6672,7 +6656,7 @@ class DependentBitIntType final : public Type, public llvm::FoldingSetNode {
   bool isSugared() const { return false; }
   QualType desugar() const { return QualType(this, 0); }
 
-  void Profile(llvm::FoldingSetNodeID &ID) {
+  void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) {
     Profile(ID, Context, isUnsigned(), getNumBitsExpr());
   }
   static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context,
diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafety.h b/clang/include/clang/Analysis/Analyses/ThreadSafety.h
index 1808d1d71e05d2..0866b09bab2995 100644
--- a/clang/include/clang/Analysis/Analyses/ThreadSafety.h
+++ b/clang/include/clang/Analysis/Analyses/ThreadSafety.h
@@ -47,7 +47,13 @@ enum ProtectedOperationKind {
   POK_PassByRef,
 
   /// Passing a pt-guarded variable by reference.
-  POK_PtPassByRef
+  POK_PtPassByRef,
+
+  /// Returning a guarded variable by reference.
+  POK_ReturnByRef,
+
+  /// Returning a pt-guarded variable by reference.
+  POK_PtReturnByRef,
 };
 
 /// This enum distinguishes between different kinds of lock actions. For
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 7a6ec77ae84b15..5c9eb7b8a98103 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4246,3 +4246,21 @@ def AvailableOnlyInDefaultEvalMethod : InheritableAttr {
   let Subjects = SubjectList<[TypedefName], ErrorDiag>;
   let Documentation = [Undocumented];
 }
+
+def CountedBy : InheritableAttr {
+  let Spellings = [Clang<"counted_by">];
+  let Subjects = SubjectList<[Field]>;
+  let Args = [IdentifierArgument<"CountedByField">];
+  let Documentation = [CountedByDocs];
+  let LangOpts = [COnly];
+  // FIXME: This is ugly. Let using a DeclArgument would be nice, but a Decl
+  // isn't yet available due to the fact that we're still parsing the
+  // structure. Maybe that code could be changed sometime in the future.
+  code AdditionalMembers = [{
+    private:
+      SourceRange CountedByFieldLoc;
+    public:
+      SourceRange getCountedByFieldLoc() const { return CountedByFieldLoc; }
+      void setCountedByFieldLoc(SourceRange Loc) { CountedByFieldLoc = Loc; }
+  }];
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 8d928dcc146b25..9f9991bdae3615 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7275,3 +7275,69 @@ relative ordering of values is important. For example:
 attribute, they default to the value ``65535``.
 }];
 }
+
+def CountedByDocs : Documentation {
+  let Category = DocCatField;
+  let Content = [{
+Clang supports the ``counted_by`` attribute on the flexible array member of a
+structure in C. The argument for the attribute is the name of a field member in
+the same structure holding the count of elements in the flexible array. This
+information can be used to improve the results of the array bound sanitizer and
+the ``__builtin_dynamic_object_size`` builtin.
+
+For example, the following code:
+
+.. code-block:: c
+
+  struct bar;
+
+  struct foo {
+    size_t count;
+    char other;
+    struct bar *array[] __attribute__((counted_by(count)));
+  };
+
+specifies that the flexible array member ``array`` has the number of elements
+allocated for it stored in ``count``. This establishes a relationship between
+``array`` and ``count``. Specifically, ``p->array`` must have at least
+``p->count`` number of elements available. It's the user's responsibility to
+ensure that this relationship is maintained through changes to the structure.
+
+In the following example, the allocated array erroneously has fewer elements
+than what's specified by ``p->count``. This would result in an out-of-bounds
+access not being detected.
+
+.. code-block:: c
+
+  #define SIZE_INCR 42
+
+  struct foo *p;
+
+  void foo_alloc(size_t count) {
+    p = malloc(MAX(sizeof(struct foo),
+                   offsetof(struct foo, array[0]) + count * sizeof(struct bar *)));
+    p->count = count + SIZE_INCR;
+  }
+
+The next example updates ``p->count``, breaking the relationship requirement
+that ``p->array`` must have at least ``p->count`` number of elements available:
+
+.. code-block:: c
+
+  #define SIZE_INCR 42
+
+  struct foo *p;
+
+  void foo_alloc(size_t count) {
+    p = malloc(MAX(sizeof(struct foo),
+                   offsetof(struct foo, array[0]) + count * sizeof(struct bar *)));
+    p->count = count;
+  }
+
+  void use_foo(int index) {
+    p->count += SIZE_INCR + 1; /* 'count' is now larger than the number of elements of 'array'. */
+    p->array[index] = 0;       /* the sanitizer can't properly check if this is an out-of-bounds access. */
+  }
+
+  }];
+}
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 6bbbc4bbdd75c6..e4802f8ab1c156 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -233,8 +233,6 @@ TARGET_BUILTIN(__builtin_ia32_minps, "V4fV4fV4f", "ncV:128:", "sse")
 TARGET_BUILTIN(__builtin_ia32_maxps, "V4fV4fV4f", "ncV:128:", "sse")
 TARGET_BUILTIN(__builtin_ia32_minss, "V4fV4fV4f", "ncV:128:", "sse")
 TARGET_BUILTIN(__builtin_ia32_maxss, "V4fV4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cmpps, "V4fV4fV4fIc", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cmpss, "V4fV4fV4fIc", "ncV:128:", "sse")
 
 TARGET_BUILTIN(__builtin_ia32_cmpeqpd, "V2dV2dV2d", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpltpd, "V2dV2dV2d", "ncV:128:", "sse2")
@@ -252,8 +250,6 @@ TARGET_BUILTIN(__builtin_ia32_cmpneqsd, "V2dV2dV2d", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpnltsd, "V2dV2dV2d", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpnlesd, "V2dV2dV2d", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_cmpordsd, "V2dV2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cmpsd, "V2dV2dV2dIc", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cmppd, "V2dV2dV2dIc", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_minpd, "V2dV2dV2d", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_maxpd, "V2dV2dV2d", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_minsd, "V2dV2dV2d", "ncV:128:", "sse2")
@@ -473,8 +469,12 @@ TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "ncV:256:", "avx")
 TARGET_BUILTIN(__builtin_ia32_shufpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
 TARGET_BUILTIN(__builtin_ia32_shufps256, "V8fV8fV8fIi", "ncV:256:", "avx")
 TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmppd, "V2dV2dV2dIc", "ncV:128:", "avx")
 TARGET_BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "ncV:256:", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmpps, "V4fV4fV4fIc", "ncV:128:", "avx")
 TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmpsd, "V2dV2dV2dIc", "ncV:128:", "avx")
+TARGET_BUILTIN(__builtin_ia32_cmpss, "V4fV4fV4fIc", "ncV:128:", "avx")
 TARGET_BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIi", "ncV:256:", "avx")
 TARGET_BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIi", "ncV:256:", "avx")
 TARGET_BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIi", "ncV:256:", "avx")
diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h
index 5606a22fe9d68b..3df037b793b394 100644
--- a/clang/include/clang/Basic/Diagnostic.h
+++ b/clang/include/clang/Basic/Diagnostic.h
@@ -1840,7 +1840,7 @@ const char ToggleHighlight = 127;
 void ProcessWarningOptions(DiagnosticsEngine &Diags,
                            const DiagnosticOptions &Opts,
                            bool ReportDiags = true);
-
+void EscapeStringForDiagnostic(StringRef Str, SmallVectorImpl<char> &OutStr);
 } // namespace clang
 
 #endif // LLVM_CLANG_BASIC_DIAGNOSTIC_H
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 2a48c063e243ee..91a95def4f80de 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -70,6 +70,16 @@ def err_drv_no_rocm_device_lib : Error<
 def err_drv_no_hip_runtime : Error<
   "cannot find HIP runtime; provide its path via '--rocm-path', or pass "
   "'-nogpuinc' to build without HIP runtime">;
+def err_drv_no_hipstdpar_lib : Error<
+  "cannot find HIP Standard Parallelism Acceleration library; provide it via "
+  "'--hipstdpar-path'">;
+def err_drv_no_hipstdpar_thrust_lib : Error<
+  "cannot find rocThrust, which is required by the HIP Standard Parallelism "
+  "Acceleration library; provide it via "
+  "'--hipstdpar-thrust-path'">;
+def err_drv_no_hipstdpar_prim_lib : Error<
+  "cannot find rocPrim, which is required by the HIP Standard Parallelism "
+  "Acceleration library; provide it via '--hipstdpar-prim-path'">;
 
 def err_drv_no_hipspv_device_lib : Error<
   "cannot find HIP device library%select{| for %1}0; provide its path via "
diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h
index bf4995175ef196..06ef1c6904c31d 100644
--- a/clang/include/clang/Basic/DiagnosticIDs.h
+++ b/clang/include/clang/Basic/DiagnosticIDs.h
@@ -31,7 +31,7 @@ namespace clang {
     // Size of each of the diagnostic categories.
     enum {
       DIAG_SIZE_COMMON        =  300,
-      DIAG_SIZE_DRIVER        =  300,
+      DIAG_SIZE_DRIVER        =  400,
       DIAG_SIZE_FRONTEND      =  150,
       DIAG_SIZE_SERIALIZATION =  120,
       DIAG_SIZE_LEX           =  400,
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 178761bdcf4d5e..43a5f000eda6cb 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1360,6 +1360,8 @@ def warn_clause_expected_string : Warning<
   "expected string literal in 'clause %0' - ignoring">, InGroup<IgnoredPragmas>;
 def err_omp_unexpected_clause : Error<
   "unexpected OpenMP clause '%0' in directive '#pragma omp %1'">;
+def err_omp_unexpected_clause_extension_only : Error<
+  "OpenMP clause '%0' is only available as extension, use '-fopenmp-extensions'">;
 def err_omp_immediate_directive : Error<
   "'#pragma omp %0' %select{|with '%2' clause }1cannot be an immediate substatement">;
 def err_omp_expected_identifier_for_critical : Error<
@@ -1452,6 +1454,8 @@ def warn_unknown_declare_variant_isa_trait
               "spelling or consider restricting the context selector with the "
               "'arch' selector further">,
       InGroup<SourceUsesOpenMP>;
+def note_ompx_bare_clause : Note<
+  "OpenMP extension clause '%0' only allowed with '#pragma omp %1'">;
 def note_omp_declare_variant_ctx_options
     : Note<"context %select{set|selector|property}0 options are: %1">;
 def warn_omp_declare_variant_expected
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 29362df6836535..e3cd49bcc9ecc6 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -240,6 +240,10 @@ def ext_imaginary_constant : Extension<
   "imaginary constants are a GNU extension">, InGroup<GNUImaginaryConstant>;
 def ext_integer_complex : Extension<
   "complex integer types are a GNU extension">, InGroup<GNUComplexInteger>;
+def ext_c23_auto_non_plain_identifier : Extension<
+  "type inference of a declaration other than a plain identifier with optional "
+  "trailing attributes is a Clang extension">,
+  InGroup<DiagGroup<"auto-decl-extensions">>;
 
 def err_invalid_saturation_spec : Error<"'_Sat' specifier is only valid on "
   "'_Fract' or '_Accum', not '%0'">;
@@ -2388,7 +2392,8 @@ def err_auto_not_allowed : Error<
   "|in conversion function type|here|in lambda parameter"
   "|in type allocated by 'new'|in K&R-style function parameter"
   "|in template parameter|in friend declaration|in function prototype that is "
-  "not a function declaration|in requires expression parameter}1">;
+  "not a function declaration|in requires expression parameter"
+  "|in array declaration}1">;
 def err_dependent_deduced_tst : Error<
   "typename specifier refers to "
   "%select{class template|function template|variable template|alias template|"
@@ -2461,7 +2466,8 @@ def err_implied_std_initializer_list_not_found : Error<
 def err_malformed_std_initializer_list : Error<
   "std::initializer_list must be a class template with a single type parameter">;
 def err_auto_init_list_from_c : Error<
-  "cannot use __auto_type with initializer list in C">;
+  "cannot use %select{'auto'|<ERROR>|'__auto_type'}0 with "
+  "%select{initializer list|array}1 in C">;
 def err_auto_bitfield : Error<
   "cannot pass bit-field as __auto_type initializer in C">;
 
@@ -3858,7 +3864,7 @@ def warn_fun_requires_negative_cap : Warning<
   "calling function %0 requires negative capability '%1'">,
   InGroup<ThreadSafetyAnalysis>, DefaultIgnore;
 
-// Thread safety warnings on pass by reference
+// Thread safety warnings on pass/return by reference
 def warn_guarded_pass_by_reference : Warning<
   "passing variable %1 by reference requires holding %0 "
   "%select{'%2'|'%2' exclusively}3">,
@@ -3867,6 +3873,14 @@ def warn_pt_guarded_pass_by_reference : Warning<
   "passing the value that %1 points to by reference requires holding %0 "
   "%select{'%2'|'%2' exclusively}3">,
   InGroup<ThreadSafetyReference>, DefaultIgnore;
+def warn_guarded_return_by_reference : Warning<
+  "returning variable %1 by reference requires holding %0 "
+  "%select{'%2'|'%2' exclusively}3">,
+  InGroup<ThreadSafetyReference>, DefaultIgnore;
+def warn_pt_guarded_return_by_reference : Warning<
+  "returning the value that %1 points to by reference requires holding %0 "
+  "%select{'%2'|'%2' exclusively}3">,
+  InGroup<ThreadSafetyReference>, DefaultIgnore;
 
 // Imprecise thread safety warnings
 def warn_variable_requires_lock : Warning<
@@ -4681,12 +4695,14 @@ def note_ovl_candidate_non_deduced_mismatch_qualified : Note<
 // Note that we don't treat templates differently for this diagnostic.
 def note_ovl_candidate_arity : Note<"candidate "
     "%sub{select_ovl_candidate_kind}0,1,2 not viable: "
-    "requires%select{ at least| at most|}3 %4 argument%s4, but %5 "
+    "requires%select{ at least| at most|}3 %4 "
+    "%select{|non-object }6argument%s4, but %5 "
     "%plural{1:was|:were}5 provided">;
 
 def note_ovl_candidate_arity_one : Note<"candidate "
     "%sub{select_ovl_candidate_kind}0,1,2 not viable: "
     "%select{requires at least|allows at most single|requires single}3 "
+    "%select{|non-object }6"
     "argument %4, but %plural{0:no|:%5}5 arguments were provided">;
 
 def note_ovl_candidate_deleted : Note<
@@ -4869,7 +4885,7 @@ def err_ovl_deleted_object_call : Error<
   "call to deleted function call operator in type %0">;
 def note_ovl_surrogate_cand : Note<"conversion candidate of type %0">;
 def err_member_call_without_object : Error<
-  "call to non-static member function without an object argument">;
+  "call to %select{non-static|explicit}0 member function without an object argument">;
 
 // C++ Address of Overloaded Function
 def err_addr_ovl_no_viable : Error<
@@ -5414,6 +5430,8 @@ def note_constraint_normalization_here : Note<
 def note_parameter_mapping_substitution_here : Note<
   "while substituting into concept arguments here; substitution failures not "
   "allowed in concept arguments">;
+def note_building_deduction_guide_here : Note<
+  "while building implicit deduction guide first needed here">;
 def note_lambda_substitution_here : Note<
   "while substituting into a lambda expression here">;
 def note_instantiation_contexts_suppressed : Note<
@@ -6379,6 +6397,21 @@ def warn_superclass_variable_sized_type_not_at_end : Warning<
   "field %0 can overwrite instance variable %1 with variable sized type %2"
   " in superclass %3">, InGroup<ObjCFlexibleArray>;
 
+def err_counted_by_attr_not_on_flexible_array_member : Error<
+  "'counted_by' only applies to flexible array members">;
+def err_flexible_array_counted_by_attr_field_not_found : Error<
+  "field %0 in 'counted_by' not found">;
+def err_flexible_array_counted_by_attr_field_not_found_suggest : Error<
+  "field %0 in 'counted_by' not found; did you mean %1?">;
+def err_flexible_array_counted_by_attr_field_not_found_in_struct : Error<
+  "field %0 in 'counted_by' is not found in struct">;
+def err_flexible_array_counted_by_attr_refers_to_self : Error<
+  "field %0 in 'counted_by' cannot refer to the flexible array">;
+def err_flexible_array_counted_by_attr_field_not_integer : Error<
+  "field %0 in 'counted_by' is not a non-boolean integer type">;
+def note_flexible_array_counted_by_attr_field : Note<
+  "field %0 declared here">;
+
 let CategoryName = "ARC Semantic Issue" in {
 
 // ARC-mode diagnostics.
@@ -6645,8 +6678,8 @@ def err_func_def_incomplete_result : Error<
 def err_atomic_specifier_bad_type
     : Error<"_Atomic cannot be applied to "
             "%select{incomplete |array |function |reference |atomic |qualified "
-            "|sizeless ||integer }0type "
-            "%1 %select{|||||||which is not trivially copyable|}0">;
+            "|sizeless ||integer |}0type "
+            "%1 %select{|||||||which is not trivially copyable||in C23}0">;
 def warn_atomic_member_access : Warning<
   "accessing a member of an atomic structure or union is undefined behavior">,
   InGroup<DiagGroup<"atomic-access">>, DefaultError;
@@ -7289,14 +7322,16 @@ def note_logical_not_silence_with_parens : Note<
   "add parentheses around left hand side expression to silence this warning">;
 
 def err_invalid_this_use : Error<
-  "invalid use of 'this' outside of a non-static member function">;
+  "invalid use of 'this' %select{outside of a non-static member function"
+  "|in a function with an explicit object parameter}0">;
 def err_this_static_member_func : Error<
   "'this' cannot be%select{| implicitly}0 used in a static member function "
   "declaration">;
-def err_invalid_member_use_in_static_method : Error<
-  "invalid use of member %0 in static member function">;
+def err_invalid_member_use_in_method : Error<
+  "invalid use of member %0 in %select{static|explicit object}1 member function">;
+
 def err_invalid_qualified_function_type : Error<
-  "%select{non-member function|static member function|deduction guide}0 "
+  "%select{non-member function|static member function|explicit object member function|deduction guide}0 "
   "%select{of type %2 |}1cannot have '%3' qualifier">;
 def err_compound_qualified_function_type : Error<
   "%select{block pointer|pointer|reference}0 to function type %select{%2 |}1"
@@ -7304,6 +7339,26 @@ def err_compound_qualified_function_type : Error<
 def err_qualified_function_typeid : Error<
   "type operand %0 of 'typeid' cannot have '%1' qualifier">;
 
+def err_cxx20_deducing_this  : Error<
+  "explicit object parameters are incompatible with C++ standards before C++2b">;
+def err_explicit_object_default_arg: Error<
+  "the explicit object parameter cannot have a default argument">;
+def err_explicit_object_parameter_pack: Error<
+  "the explicit object parameter cannot be a function parameter pack">;
+def err_explicit_object_parameter_must_be_first: Error<
+  "an explicit object parameter can only appear as the first parameter "
+  "of the %select{function|lambda}0">;
+def err_explicit_object_parameter_nonmember: Error<
+  "an explicit object parameter cannot appear in a "
+  "%select{static|virtual|non-member}0 %select{function|lambda}1">;
+def err_explicit_object_parameter_constructor: Error<
+  "an explicit object parameter cannot appear in a %select{constructor|destructor}0">;
+def err_explicit_object_parameter_mutable: Error<
+  "a lambda with an explicit object parameter cannot be mutable">;
+def err_invalid_explicit_object_type_in_lambda: Error<
+  "invalid explicit object parameter type %0 in lambda with capture; "
+  "the type must be the same as, or derived from, the lambda">;
+
 def err_ref_qualifier_overload : Error<
   "cannot overload a member function %select{without a ref-qualifier|with "
   "ref-qualifier '&'|with ref-qualifier '&&'}0 with a member function %select{"
@@ -8525,53 +8580,65 @@ def err_call_function_incomplete_return : Error<
 def err_call_incomplete_argument : Error<
   "argument type %0 is incomplete">;
 def err_typecheck_call_too_few_args : Error<
-  "too few %select{|||execution configuration }0arguments to "
+  "too few %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "expected %1, have %2">;
 def err_typecheck_call_too_few_args_one : Error<
-  "too few %select{|||execution configuration }0arguments to "
+  "too few %select{|||execution configuration }0"
+  "%select{|non-object }2arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "single argument %1 was not specified">;
 def err_typecheck_call_too_few_args_at_least : Error<
-  "too few %select{|||execution configuration }0arguments to "
+  "too few %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "expected at least %1, have %2">;
 def err_typecheck_call_too_few_args_at_least_one : Error<
-  "too few %select{|||execution configuration }0arguments to "
+  "too few %select{|||execution configuration }0"
+  "%select{|non-object }2arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "at least argument %1 must be specified">;
 def err_typecheck_call_too_few_args_suggest : Error<
-  "too few %select{|||execution configuration }0arguments to "
+  "too few %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
-  "expected %1, have %2; did you mean %3?">;
+  "expected %1, have %2; did you mean %4?">;
 def err_typecheck_call_too_few_args_at_least_suggest : Error<
-  "too few %select{|||execution configuration }0arguments to "
+  "too few %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
-  "expected at least %1, have %2; did you mean %3?">;
+  "expected at least %1, have %2; did you mean %4?">;
 def err_typecheck_call_too_many_args : Error<
-  "too many %select{|||execution configuration }0arguments to "
+  "too many %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "expected %1, have %2">;
 def err_typecheck_call_too_many_args_one : Error<
-  "too many %select{|||execution configuration }0arguments to "
+  "too many %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "expected single argument %1, have %2 arguments">;
 def err_typecheck_call_too_many_args_at_most : Error<
-  "too many %select{|||execution configuration }0arguments to "
+  "too many %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
   "expected at most %1, have %2">;
 def err_typecheck_call_too_many_args_at_most_one : Error<
   "too many %select{|||execution configuration }0arguments to "
   "%select{function|block|method|kernel function}0 call, "
-  "expected at most single argument %1, have %2 arguments">;
+  "expected at most single %select{|non-object }3argument %1, "
+  "have %2%select{|non-object}3 arguments">;
 def err_typecheck_call_too_many_args_suggest : Error<
-  "too many %select{|||execution configuration }0arguments to "
+  "too many %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
-  "expected %1, have %2; did you mean %3?">;
+  "expected %1, have %2; did you mean %4?">;
 def err_typecheck_call_too_many_args_at_most_suggest : Error<
-  "too many %select{|||execution configuration }0arguments to "
+  "too many %select{|||execution configuration }0"
+  "%select{|non-object }3arguments to "
   "%select{function|block|method|kernel function}0 call, "
-  "expected at most %1, have %2; did you mean %3?">;
+  "expected at most %1, have %2; did you mean %4?">;
 
 def err_arc_typecheck_convert_incompatible_pointer : Error<
   "incompatible pointer types passing retainable parameter of type %0"
@@ -9389,10 +9456,10 @@ def warn_cxx98_compat_explicit_conversion_functions : Warning<
 
 // C++11 defaulted functions
 def err_defaulted_special_member_params : Error<
-  "an explicitly-defaulted %select{|copy |move }0constructor cannot "
+  "an explicitly-defaulted %sub{select_special_member_kind}0 cannot "
   "have default arguments">;
 def err_defaulted_special_member_variadic : Error<
-  "an explicitly-defaulted %select{|copy |move }0constructor cannot "
+  "an explicitly-defaulted %sub{select_special_member_kind}0 cannot "
   "be variadic">;
 def err_defaulted_special_member_return_type : Error<
   "explicitly-defaulted %select{copy|move}0 assignment operator must "
@@ -9416,6 +9483,8 @@ def err_defaulted_copy_assign_not_ref : Error<
 def err_incorrect_defaulted_constexpr : Error<
   "defaulted definition of %sub{select_special_member_kind}0 "
   "is not constexpr">;
+def err_incorrect_defaulted_constexpr_with_vb: Error<
+  "%sub{select_special_member_kind}0 cannot be 'constexpr' in a class with virtual base class">;
 def err_incorrect_defaulted_consteval : Error<
   "defaulted declaration of %sub{select_special_member_kind}0 "
   "cannot be consteval because implicit definition is not constexpr">;
@@ -9455,7 +9524,7 @@ def err_defaulted_comparison_template : Error<
   "comparison operator template cannot be defaulted">;
 def err_defaulted_comparison_num_args : Error<
   "%select{non-member|member}0 %sub{select_defaulted_comparison_kind}1"
-  " comparison operator must have %select{2|1}0 parameters">;
+  " must have %select{2|1}0 parameters">;
 def err_defaulted_comparison_param : Error<
   "invalid parameter type for defaulted %sub{select_defaulted_comparison_kind}0"
   "; found %1, expected %2%select{| or %4}3">;
diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h
index c377aba2c1d2eb..23237a9326f84b 100644
--- a/clang/include/clang/Basic/FileEntry.h
+++ b/clang/include/clang/Basic/FileEntry.h
@@ -426,7 +426,6 @@ class FileEntry {
 public:
   ~FileEntry();
   StringRef getName() const { return LastRef->getName(); }
-  FileEntryRef getLastRef() const { return *LastRef; }
 
   StringRef tryGetRealPathName() const { return RealPathName; }
   off_t getSize() const { return Size; }
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 28c9bcec3ee60f..c0ea4ecb9806a5 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -280,6 +280,8 @@ ENUM_LANGOPT(SYCLVersion  , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL
 
 LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
 LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)")
+LANGOPT(HIPStdPar, 1, 0, "Enable Standard Parallel Algorithm Acceleration for HIP (experimental)")
+LANGOPT(HIPStdParInterposeAlloc, 1, 0, "Replace allocations / deallocations with HIP RT calls when Standard Parallel Algorithm Acceleration for HIP is enabled (Experimental)")
 
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
diff --git a/clang/include/clang/Basic/ObjCRuntime.h b/clang/include/clang/Basic/ObjCRuntime.h
index 0f714ed3ad603a..d783154c3f9f13 100644
--- a/clang/include/clang/Basic/ObjCRuntime.h
+++ b/clang/include/clang/Basic/ObjCRuntime.h
@@ -483,7 +483,7 @@ class ObjCRuntime {
   }
 
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const ObjCRuntime &OCR) {
     HBuilder.add(OCR.getKind(), OCR.getVersion());
   }
diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h
index 4659e45c788341..090a3a7fa90762 100644
--- a/clang/include/clang/Basic/Sanitizers.h
+++ b/clang/include/clang/Basic/Sanitizers.h
@@ -78,7 +78,7 @@ class SanitizerMask {
   llvm::hash_code hash_value() const;
 
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const SanitizerMask &SM) {
     HBuilder.addRange(&SM.maskLoToHigh[0], &SM.maskLoToHigh[kNumElem]);
   }
diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index c1b24eec2759c7..08a69e08405dfb 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -877,13 +877,6 @@ class SourceManager : public RefCountedBase<SourceManager> {
 
   /// Create a new FileID that represents the specified file
   /// being \#included from the specified IncludePosition.
-  ///
-  /// This translates NULL into standard input.
-  FileID createFileID(const FileEntry *SourceFile, SourceLocation IncludePos,
-                      SrcMgr::CharacteristicKind FileCharacter,
-                      int LoadedID = 0,
-                      SourceLocation::UIntTy LoadedOffset = 0);
-
   FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos,
                       SrcMgr::CharacteristicKind FileCharacter,
                       int LoadedID = 0,
@@ -909,7 +902,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
 
   /// Get the FileID for \p SourceFile if it exists. Otherwise, create a
   /// new FileID for the \p SourceFile.
-  FileID getOrCreateFileID(const FileEntry *SourceFile,
+  FileID getOrCreateFileID(FileEntryRef SourceFile,
                            SrcMgr::CharacteristicKind FileCharacter);
 
   /// Creates an expansion SLocEntry for the substitution of an argument into a
diff --git a/clang/include/clang/Driver/OffloadBundler.h b/clang/include/clang/Driver/OffloadBundler.h
index 28473c53662de2..17df31d31071d9 100644
--- a/clang/include/clang/Driver/OffloadBundler.h
+++ b/clang/include/clang/Driver/OffloadBundler.h
@@ -19,6 +19,7 @@
 
 #include "llvm/Support/Error.h"
 #include "llvm/TargetParser/Triple.h"
+#include <llvm/Support/MemoryBuffer.h>
 #include <string>
 #include <vector>
 
@@ -26,11 +27,15 @@ namespace clang {
 
 class OffloadBundlerConfig {
 public:
+  OffloadBundlerConfig();
+
   bool AllowNoHost = false;
   bool AllowMissingBundles = false;
   bool CheckInputArchive = false;
   bool PrintExternalCommands = false;
   bool HipOpenmpCompatible = false;
+  bool Compress = false;
+  bool Verbose = false;
 
   unsigned BundleAlignment = 1;
   unsigned HostInputIndex = ~0u;
@@ -84,6 +89,38 @@ struct OffloadTargetInfo {
   std::string str() const;
 };
 
+// CompressedOffloadBundle represents the format for the compressed offload
+// bundles.
+//
+// The format is as follows:
+// - Magic Number (4 bytes) - A constant "CCOB".
+// - Version (2 bytes)
+// - Compression Method (2 bytes) - Uses the values from
+// llvm::compression::Format.
+// - Uncompressed Size (4 bytes).
+// - Truncated MD5 Hash (8 bytes).
+// - Compressed Data (variable length).
+
+class CompressedOffloadBundle {
+private:
+  static inline const size_t MagicSize = 4;
+  static inline const size_t VersionFieldSize = sizeof(uint16_t);
+  static inline const size_t MethodFieldSize = sizeof(uint16_t);
+  static inline const size_t SizeFieldSize = sizeof(uint32_t);
+  static inline const size_t HashFieldSize = 8;
+  static inline const size_t HeaderSize = MagicSize + VersionFieldSize +
+                                          MethodFieldSize + SizeFieldSize +
+                                          HashFieldSize;
+  static inline const llvm::StringRef MagicNumber = "CCOB";
+  static inline const uint16_t Version = 1;
+
+public:
+  static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
+  compress(const llvm::MemoryBuffer &Input, bool Verbose = false);
+  static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
+  decompress(const llvm::MemoryBuffer &Input, bool Verbose = false);
+};
+
 } // namespace clang
 
 #endif // LLVM_CLANG_DRIVER_OFFLOADBUNDLER_H
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index efd90942948af2..381126be176ac4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1183,6 +1183,10 @@ def fgpu_inline_threshold_EQ : Joined<["-"], "fgpu-inline-threshold=">,
 def fgpu_sanitize : Flag<["-"], "fgpu-sanitize">, Group<f_Group>,
   HelpText<"Enable sanitizer for supported offloading devices">;
 def fno_gpu_sanitize : Flag<["-"], "fno-gpu-sanitize">, Group<f_Group>;
+
+def offload_compress : Flag<["--"], "offload-compress">,
+  HelpText<"Compress offload device binaries (HIP only)">;
+def no_offload_compress : Flag<["--"], "no-offload-compress">;
 }
 
 // CUDA options
@@ -1258,6 +1262,32 @@ def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<hip_Group>,
   HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
 def hip_path_EQ : Joined<["--"], "hip-path=">, Group<hip_Group>,
   HelpText<"HIP runtime installation path, used for finding HIP version and adding HIP include path.">;
+def hipstdpar : Flag<["--"], "hipstdpar">,
+  Visibility<[ClangOption, CC1Option]>,
+  Group<CompileOnly_Group>,
+  HelpText<"Enable HIP acceleration for standard parallel algorithms">,
+  MarshallingInfoFlag<LangOpts<"HIPStdPar">>;
+def hipstdpar_interpose_alloc : Flag<["--"], "hipstdpar-interpose-alloc">,
+  Visibility<[ClangOption, CC1Option]>,
+  Group<CompileOnly_Group>,
+  HelpText<"Replace all memory allocation / deallocation calls with "
+           "hipManagedMalloc / hipFree equivalents">,
+  MarshallingInfoFlag<LangOpts<"HIPStdParInterposeAlloc">>;
+// TODO: use MarshallingInfo here
+def hipstdpar_path_EQ : Joined<["--"], "hipstdpar-path=">, Group<i_Group>,
+  HelpText<
+    "HIP Standard Parallel Algorithm Acceleration library path, used for "
+    "finding and implicitly including the library header">;
+def hipstdpar_thrust_path_EQ : Joined<["--"], "hipstdpar-thrust-path=">,
+  Group<i_Group>,
+  HelpText<
+    "rocThrust path, required by the HIP Standard Parallel Algorithm "
+    "Acceleration library, used to implicitly include the rocThrust library">;
+def hipstdpar_prim_path_EQ : Joined<["--"], "hipstdpar-prim-path=">,
+  Group<i_Group>,
+  HelpText<
+    "rocPrim path, required by the HIP Standard Parallel Algorithm "
+    "Acceleration library, used to implicitly include the rocPrim library">;
 def rocm_device_lib_path_EQ : Joined<["--"], "rocm-device-lib-path=">, Group<hip_Group>,
   HelpText<"ROCm device library path. Alternative to rocm-path.">;
 def : Joined<["--"], "hip-device-lib-path=">, Alias<rocm_device_lib_path_EQ>;
@@ -1879,6 +1909,12 @@ defm safe_buffer_usage_suggestions : BoolFOption<"safe-buffer-usage-suggestions"
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
           "Display suggestions to update code associated with -Wunsafe-buffer-usage warnings">,
   NegFlag<SetFalse>>;
+def fverify_intermediate_code : Flag<["-"], "fverify-intermediate-code">,
+  Group<f_clang_Group>, Visibility<[ClangOption, CLOption, DXCOption]>,
+  HelpText<"Enable verification of LLVM IR">, Flags<[NoXarchOption]>;
+def fno_verify_intermediate_code : Flag<["-"], "fno-verify-intermediate-code">,
+  Group<f_clang_Group>, Visibility<[ClangOption, CLOption, DXCOption]>,
+  HelpText<"Disable verification of LLVM IR">, Flags<[NoXarchOption]>;
 def fdiscard_value_names : Flag<["-"], "fdiscard-value-names">,
   Group<f_clang_Group>, Visibility<[ClangOption, DXCOption]>,
   HelpText<"Discard value names in LLVM IR">, Flags<[NoXarchOption]>;
@@ -4257,7 +4293,7 @@ def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>,
   MarshallingInfoString<TargetOpts<"CodeModel">, [{"default"}]>;
 def mlarge_data_threshold_EQ : Joined<["-"], "mlarge-data-threshold=">, Group<m_Group>,
   Visibility<[ClangOption, CC1Option]>,
-  MarshallingInfoInt<TargetOpts<"LargeDataThreshold">>;
+  MarshallingInfoInt<TargetOpts<"LargeDataThreshold">, "65535">;
 def mtls_size_EQ : Joined<["-"], "mtls-size=">, Group<m_Group>,
   Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Specify bit size of immediate TLS offsets (AArch64 ELF only): "
@@ -4557,18 +4593,17 @@ foreach i = {8-15,18} in
     HelpText<"Make the x"#i#" register call-saved (AArch64 only)">;
 
 def msve_vector_bits_EQ : Joined<["-"], "msve-vector-bits=">, Group<m_aarch64_Features_Group>,
+  Visibility<[ClangOption, FlangOption]>,
   HelpText<"Specify the size in bits of an SVE vector register. Defaults to the"
            " vector length agnostic value of \"scalable\". (AArch64 only)">;
 } // let Flags = [TargetSpecific]
 
 def mvscale_min_EQ : Joined<["-"], "mvscale-min=">,
-  Group<m_aarch64_Features_Group>, Flags<[NoXarchOption]>,
-  Visibility<[ClangOption, CC1Option]>,
+  Visibility<[CC1Option, FC1Option]>,
   HelpText<"Specify the vscale minimum. Defaults to \"1\". (AArch64/RISC-V only)">,
   MarshallingInfoInt<LangOpts<"VScaleMin">>;
 def mvscale_max_EQ : Joined<["-"], "mvscale-max=">,
-  Group<m_aarch64_Features_Group>, Flags<[NoXarchOption]>,
-  Visibility<[ClangOption, CC1Option]>,
+  Visibility<[CC1Option, FC1Option]>,
   HelpText<"Specify the vscale maximum. Defaults to the"
            " vector length agnostic value of \"0\". (AArch64/RISC-V only)">,
   MarshallingInfoInt<LangOpts<"VScaleMax">>;
diff --git a/clang/include/clang/Lex/HeaderSearchOptions.h b/clang/include/clang/Lex/HeaderSearchOptions.h
index 126659f3ac0022..206bc69d7b2cdc 100644
--- a/clang/include/clang/Lex/HeaderSearchOptions.h
+++ b/clang/include/clang/Lex/HeaderSearchOptions.h
@@ -268,7 +268,7 @@ inline llvm::hash_code hash_value(const HeaderSearchOptions::Entry &E) {
 }
 
 template <typename HasherT, llvm::support::endianness Endianness>
-inline void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+inline void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                     const HeaderSearchOptions::Entry &E) {
   HBuilder.add(E.Path, E.Group, E.IsFramework, E.IgnoreSysRoot);
 }
@@ -279,7 +279,7 @@ hash_value(const HeaderSearchOptions::SystemHeaderPrefix &SHP) {
 }
 
 template <typename HasherT, llvm::support::endianness Endianness>
-inline void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+inline void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                     const HeaderSearchOptions::SystemHeaderPrefix &SHP) {
   HBuilder.add(SHP.Prefix, SHP.IsSystemHeader);
 }
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index e88164f196c1f7..a8d2599d94ebc6 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -1722,6 +1722,9 @@ class Preprocessor {
   /// Lex the next token for this preprocessor.
   void Lex(Token &Result);
 
+  /// Lex all tokens for this preprocessor until (and excluding) end of file.
+  void LexTokensUntilEOF(std::vector<Token> *Tokens = nullptr);
+
   /// Lex a token, forming a header-name token if possible.
   bool LexHeaderName(Token &Result, bool AllowMacroExpansion = true);
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 0e969f341bbe19..79ac622fd03e24 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1766,6 +1766,7 @@ class Parser : public CodeCompletionHandler {
   ExprResult ParseConstantExpressionInExprEvalContext(
       TypeCastState isTypeCast = NotTypeCast);
   ExprResult ParseConstantExpression();
+  ExprResult ParseArrayBoundExpression();
   ExprResult ParseCaseExpression(SourceLocation CaseLoc);
   ExprResult ParseConstraintExpression();
   ExprResult
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index ff5e2e2e57366d..4561cca929c0d0 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -2674,6 +2674,8 @@ class Declarator {
   /// redeclaration time if the decl is static.
   bool isStaticMember();
 
+  bool isExplicitObjectMemberFunction();
+
   /// Returns true if this declares a constructor or a destructor.
   bool isCtorOrDtor();
 
diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h
index 26c0387dfc444c..361108fb51ba0e 100644
--- a/clang/include/clang/Sema/ScopeInfo.h
+++ b/clang/include/clang/Sema/ScopeInfo.h
@@ -847,6 +847,8 @@ class LambdaScopeInfo final :
   /// is known.
   bool AfterParameterList = true;
 
+  ParmVarDecl *ExplicitObjectParameter = nullptr;
+
   /// Source range covering the lambda introducer [...].
   SourceRange IntroducerRange;
 
@@ -1042,6 +1044,8 @@ class LambdaScopeInfo final :
 
   void visitPotentialCaptures(
       llvm::function_ref<void(ValueDecl *, Expr *)> Callback) const;
+
+  bool lambdaCaptureShouldBeConst() const;
 };
 
 FunctionScopeInfo::WeakObjectProfileTy::WeakObjectProfileTy()
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index e13524b5f3b30c..f77e109bac323e 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -1062,20 +1062,6 @@ class Sema final {
     }
   };
 
-  /// Whether the AST is currently being rebuilt to correct immediate
-  /// invocations. Immediate invocation candidates and references to consteval
-  /// functions aren't tracked when this is set.
-  bool RebuildingImmediateInvocation = false;
-
-  /// Used to change context to isConstantEvaluated without pushing a heavy
-  /// ExpressionEvaluationContextRecord object.
-  bool isConstantEvaluatedOverride;
-
-  bool isConstantEvaluated() const {
-    return ExprEvalContexts.back().isConstantEvaluated() ||
-           isConstantEvaluatedOverride;
-  }
-
   /// RAII object to handle the state changes required to synthesize
   /// a function body.
   class SynthesizedFunctionScope {
@@ -1361,6 +1347,11 @@ class Sema final {
 
     bool IsCurrentlyCheckingDefaultArgumentOrInitializer = false;
 
+    // We are in a constant context, but we also allow
+    // non constant expressions, for example for array bounds (which may be
+    // VLAs).
+    bool InConditionallyConstantEvaluateContext = false;
+
     // When evaluating immediate functions in the initializer of a default
     // argument or default member initializer, this is the declaration whose
     // default initializer is being evaluated and the location of the call
@@ -2183,20 +2174,17 @@ class Sema final {
       const FunctionProtoType *Old, SourceLocation OldLoc,
       const FunctionProtoType *New, SourceLocation NewLoc);
   bool handlerCanCatch(QualType HandlerType, QualType ExceptionType);
-  bool CheckExceptionSpecSubset(const PartialDiagnostic &DiagID,
-                                const PartialDiagnostic &NestedDiagID,
-                                const PartialDiagnostic &NoteID,
-                                const PartialDiagnostic &NoThrowDiagID,
-                                const FunctionProtoType *Superset,
-                                SourceLocation SuperLoc,
-                                const FunctionProtoType *Subset,
-                                SourceLocation SubLoc);
-  bool CheckParamExceptionSpec(const PartialDiagnostic &NestedDiagID,
-                               const PartialDiagnostic &NoteID,
-                               const FunctionProtoType *Target,
-                               SourceLocation TargetLoc,
-                               const FunctionProtoType *Source,
-                               SourceLocation SourceLoc);
+  bool CheckExceptionSpecSubset(
+      const PartialDiagnostic &DiagID, const PartialDiagnostic &NestedDiagID,
+      const PartialDiagnostic &NoteID, const PartialDiagnostic &NoThrowDiagID,
+      const FunctionProtoType *Superset, bool SkipSupersetFirstParameter,
+      SourceLocation SuperLoc, const FunctionProtoType *Subset,
+      bool SkipSubsetFirstParameter, SourceLocation SubLoc);
+  bool CheckParamExceptionSpec(
+      const PartialDiagnostic &NestedDiagID, const PartialDiagnostic &NoteID,
+      const FunctionProtoType *Target, bool SkipTargetFirstParameter,
+      SourceLocation TargetLoc, const FunctionProtoType *Source,
+      bool SkipSourceFirstParameter, SourceLocation SourceLoc);
 
   TypeResult ActOnTypeName(Scope *S, Declarator &D);
 
@@ -3023,7 +3011,8 @@ class Sema final {
   Attr *getImplicitCodeSegOrSectionAttrForFunction(const FunctionDecl *FD,
                                                    bool IsDefinition);
   void CheckFunctionOrTemplateParamDeclarator(Scope *S, Declarator &D);
-  Decl *ActOnParamDeclarator(Scope *S, Declarator &D);
+  Decl *ActOnParamDeclarator(Scope *S, Declarator &D,
+                             SourceLocation ExplicitThisLoc = {});
   ParmVarDecl *BuildParmVarDeclForTypedef(DeclContext *DC,
                                           SourceLocation Loc,
                                           QualType T);
@@ -3797,8 +3786,12 @@ class Sema final {
                              NamedDecl *&OldDecl,
                              bool UseMemberUsingDeclRules);
   bool IsOverload(FunctionDecl *New, FunctionDecl *Old,
-                  bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs = true,
-                  bool ConsiderRequiresClauses = true);
+                  bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs = true);
+
+  // Checks whether MD constitutes an override the base class method BaseMD.
+  // When checking for overrides, the object object members are ignored.
+  bool IsOverride(FunctionDecl *MD, FunctionDecl *BaseMD,
+                  bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs = true);
 
   // Calculates whether the expression Constraint depends on an enclosing
   // template, for the purposes of [temp.friend] p9.
@@ -3856,6 +3849,12 @@ class Sema final {
                                  QualType &ConvertedType);
   bool IsBlockPointerConversion(QualType FromType, QualType ToType,
                                 QualType& ConvertedType);
+
+  bool FunctionParamTypesAreEqual(ArrayRef<QualType> Old,
+                                  ArrayRef<QualType> New,
+                                  unsigned *ArgPos = nullptr,
+                                  bool Reversed = false);
+
   bool FunctionParamTypesAreEqual(const FunctionProtoType *OldType,
                                   const FunctionProtoType *NewType,
                                   unsigned *ArgPos = nullptr,
@@ -3896,10 +3895,11 @@ class Sema final {
                                        ExprResult Init,
                                        bool TopLevelOfInitList = false,
                                        bool AllowExplicit = false);
-  ExprResult PerformObjectArgumentInitialization(Expr *From,
-                                                 NestedNameSpecifier *Qualifier,
-                                                 NamedDecl *FoundDecl,
-                                                 CXXMethodDecl *Method);
+  ExprResult InitializeExplicitObjectArgument(Sema &S, Expr *Obj,
+                                              FunctionDecl *Fun);
+  ExprResult PerformImplicitObjectArgumentInitialization(
+      Expr *From, NestedNameSpecifier *Qualifier, NamedDecl *FoundDecl,
+      CXXMethodDecl *Method);
 
   /// Check that the lifetime of the initializer (and its subobjects) is
   /// sufficient for initializing the entity, and perform lifetime extension
@@ -4221,9 +4221,8 @@ class Sema final {
       QualType DestTypeForComplaining = QualType(),
       unsigned DiagIDForComplaining = 0);
 
-  Expr *FixOverloadedFunctionReference(Expr *E,
-                                       DeclAccessPair FoundDecl,
-                                       FunctionDecl *Fn);
+  ExprResult FixOverloadedFunctionReference(Expr *E, DeclAccessPair FoundDecl,
+                                            FunctionDecl *Fn);
   ExprResult FixOverloadedFunctionReference(ExprResult,
                                             DeclAccessPair FoundDecl,
                                             FunctionDecl *Fn);
@@ -4796,11 +4795,13 @@ class Sema final {
   bool CheckAlwaysInlineAttr(const Stmt *OrigSt, const Stmt *CurSt,
                              const AttributeCommonInfo &A);
 
+  bool CheckCountedByAttr(Scope *Scope, const FieldDecl *FD);
+
   /// Adjust the calling convention of a method to be the ABI default if it
   /// wasn't specified explicitly.  This handles method types formed from
   /// function type typedefs and typename template arguments.
-  void adjustMemberFunctionCC(QualType &T, bool IsStatic, bool IsCtorOrDtor,
-                              SourceLocation Loc);
+  void adjustMemberFunctionCC(QualType &T, bool HasThisPointer,
+                              bool IsCtorOrDtor, SourceLocation Loc);
 
   // Check if there is an explicit attribute, but only look through parens.
   // The intent is to look for an attribute on the current declarator, but not
@@ -5796,6 +5797,10 @@ class Sema final {
                           Expr *Input, bool IsAfterAmp = false);
 
   bool isQualifiedMemberAccess(Expr *E);
+  bool CheckUseOfCXXMethodAsAddressOfOperand(SourceLocation OpLoc,
+                                             const Expr *Op,
+                                             const CXXMethodDecl *MD);
+
   QualType CheckAddressOfOperand(ExprResult &Operand, SourceLocation OpLoc);
 
   bool CheckTypeTraitArity(unsigned Arity, SourceLocation Loc, size_t N);
@@ -7236,6 +7241,8 @@ class Sema final {
       StorageClass SC, ArrayRef<ParmVarDecl *> Params,
       bool HasExplicitResultType);
 
+  void DiagnoseInvalidExplicitObjectParameterInLambda(CXXMethodDecl *Method);
+
   /// Perform initialization analysis of the init-capture and perform
   /// any implicit conversions such as an lvalue-to-rvalue conversion if
   /// not being used to initialize a reference.
@@ -7369,7 +7376,8 @@ class Sema final {
   public:
     LambdaScopeForCallOperatorInstantiationRAII(
         Sema &SemasRef, FunctionDecl *FD, MultiLevelTemplateArgumentList MLTAL,
-        LocalInstantiationScope &Scope);
+        LocalInstantiationScope &Scope,
+        bool ShouldAddDeclsFromParentScope = true);
   };
 
   /// Check whether the given expression is a valid constraint expression.
@@ -7395,6 +7403,11 @@ class Sema final {
   llvm::ContextualFoldingSet<ConstraintSatisfaction, const ASTContext &>
       SatisfactionCache;
 
+  /// Introduce the instantiated local variables into the local
+  /// instantiation scope.
+  void addInstantiatedLocalVarsToScope(FunctionDecl *Function,
+                                       const FunctionDecl *PatternDecl,
+                                       LocalInstantiationScope &Scope);
   /// Introduce the instantiated function parameters into the local
   /// instantiation scope, and set the parameter names to those used
   /// in the template.
@@ -7916,6 +7929,13 @@ class Sema final {
   void DefineDefaultedComparison(SourceLocation Loc, FunctionDecl *FD,
                                  DefaultedComparisonKind DCK);
 
+  void CheckExplicitObjectMemberFunction(Declarator &D, DeclarationName Name,
+                                         QualType R, bool IsLambda,
+                                         DeclContext *DC = nullptr);
+  void CheckExplicitObjectMemberFunction(DeclContext *DC, Declarator &D,
+                                         DeclarationName Name, QualType R);
+  void CheckExplicitObjectLambda(Declarator &D);
+
   //===--------------------------------------------------------------------===//
   // C++ Derived Classes
   //
@@ -7967,6 +7987,10 @@ class Sema final {
   bool CheckOverridingFunctionReturnType(const CXXMethodDecl *New,
                                          const CXXMethodDecl *Old);
 
+  // Check that the overriding method has no explicit object parameter.
+  bool CheckExplicitObjectOverride(CXXMethodDecl *New,
+                                   const CXXMethodDecl *Old);
+
   /// CheckOverridingFunctionExceptionSpec - Checks whether the exception
   /// spec is a subset of base spec.
   bool CheckOverridingFunctionExceptionSpec(const CXXMethodDecl *New,
@@ -9210,6 +9234,7 @@ class Sema final {
       TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef<Expr *> Args,
       FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info,
       bool PartialOverloading, bool AggregateDeductionCandidate,
+      QualType ObjectType, Expr::Classification ObjectClassification,
       llvm::function_ref<bool(ArrayRef<QualType>)> CheckNonDependent);
 
   TemplateDeductionResult
@@ -9220,11 +9245,10 @@ class Sema final {
                           sema::TemplateDeductionInfo &Info,
                           bool IsAddressOfFunction = false);
 
-  TemplateDeductionResult
-  DeduceTemplateArguments(FunctionTemplateDecl *FunctionTemplate,
-                          QualType ToType,
-                          CXXConversionDecl *&Specialization,
-                          sema::TemplateDeductionInfo &Info);
+  TemplateDeductionResult DeduceTemplateArguments(
+      FunctionTemplateDecl *FunctionTemplate, QualType ObjectType,
+      Expr::Classification ObjectClassification, QualType ToType,
+      CXXConversionDecl *&Specialization, sema::TemplateDeductionInfo &Info);
 
   TemplateDeductionResult
   DeduceTemplateArguments(FunctionTemplateDecl *FunctionTemplate,
@@ -9845,30 +9869,44 @@ class Sema final {
   /// diagnostics that will be suppressed.
   std::optional<sema::TemplateDeductionInfo *> isSFINAEContext() const;
 
-  /// Determines whether we are currently in a context that
-  /// is not evaluated as per C++ [expr] p5.
-  bool isUnevaluatedContext() const {
+  /// Whether the AST is currently being rebuilt to correct immediate
+  /// invocations. Immediate invocation candidates and references to consteval
+  /// functions aren't tracked when this is set.
+  bool RebuildingImmediateInvocation = false;
+
+  /// Used to change context to isConstantEvaluated without pushing a heavy
+  /// ExpressionEvaluationContextRecord object.
+  bool isConstantEvaluatedOverride = false;
+
+  const ExpressionEvaluationContextRecord &currentEvaluationContext() const {
     assert(!ExprEvalContexts.empty() &&
            "Must be in an expression evaluation context");
-    return ExprEvalContexts.back().isUnevaluated();
-  }
+    return ExprEvalContexts.back();
+  };
 
   bool isConstantEvaluatedContext() const {
-    assert(!ExprEvalContexts.empty() &&
-           "Must be in an expression evaluation context");
-    return ExprEvalContexts.back().isConstantEvaluated();
+    return currentEvaluationContext().isConstantEvaluated() ||
+           isConstantEvaluatedOverride;
+  }
+
+  bool isAlwaysConstantEvaluatedContext() const {
+    const ExpressionEvaluationContextRecord &Ctx = currentEvaluationContext();
+    return (Ctx.isConstantEvaluated() || isConstantEvaluatedOverride) &&
+           !Ctx.InConditionallyConstantEvaluateContext;
+  }
+
+  /// Determines whether we are currently in a context that
+  /// is not evaluated as per C++ [expr] p5.
+  bool isUnevaluatedContext() const {
+    return currentEvaluationContext().isUnevaluated();
   }
 
   bool isImmediateFunctionContext() const {
-    assert(!ExprEvalContexts.empty() &&
-           "Must be in an expression evaluation context");
-    return ExprEvalContexts.back().isImmediateFunctionContext();
+    return currentEvaluationContext().isImmediateFunctionContext();
   }
 
   bool isCheckingDefaultArgumentOrInitializer() const {
-    assert(!ExprEvalContexts.empty() &&
-           "Must be in an expression evaluation context");
-    const ExpressionEvaluationContextRecord &Ctx = ExprEvalContexts.back();
+    const ExpressionEvaluationContextRecord &Ctx = currentEvaluationContext();
     return (Ctx.Context ==
             ExpressionEvaluationContext::PotentiallyEvaluatedIfUsed) ||
            Ctx.IsCurrentlyCheckingDefaultArgumentOrInitializer;
@@ -12448,6 +12486,10 @@ class Sema final {
                                          SourceLocation LParenLoc,
                                          SourceLocation EndLoc);
 
+  /// Called on a well-formed 'ompx_bare' clause.
+  OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc,
+                                    SourceLocation EndLoc);
+
   /// The kind of conversion being performed.
   enum CheckedConversionKind {
     /// An implicit conversion.
diff --git a/clang/include/clang/Serialization/ModuleFileExtension.h b/clang/include/clang/Serialization/ModuleFileExtension.h
index 2168ce2ce607c0..d7d456c8b5db8e 100644
--- a/clang/include/clang/Serialization/ModuleFileExtension.h
+++ b/clang/include/clang/Serialization/ModuleFileExtension.h
@@ -86,8 +86,7 @@ class ModuleFileExtension
   /// The default implementation of this function simply does nothing, so the
   /// presence/absence of this extension does not distinguish module files.
   using ExtensionHashBuilder =
-      llvm::HashBuilderImpl<llvm::MD5,
-                            llvm::support::endian::system_endianness()>;
+      llvm::HashBuilder<llvm::MD5, llvm::endianness::native>;
   virtual void hashExtension(ExtensionHashBuilder &HBuilder) const;
 
   /// Create a new module file extension writer, which will be
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 8917662c533682..cdc3d62bca0087 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -925,10 +925,14 @@ ASTContext::ASTContext(LangOptions &LOpts, SourceManager &SM,
                        IdentifierTable &idents, SelectorTable &sels,
                        Builtin::Context &builtins, TranslationUnitKind TUKind)
     : ConstantArrayTypes(this_(), ConstantArrayTypesLog2InitSize),
+      DependentSizedArrayTypes(this_()), DependentSizedExtVectorTypes(this_()),
+      DependentAddressSpaceTypes(this_()), DependentVectorTypes(this_()),
+      DependentSizedMatrixTypes(this_()),
       FunctionProtoTypes(this_(), FunctionProtoTypesLog2InitSize),
+      DependentTypeOfExprTypes(this_()), DependentDecltypeTypes(this_()),
       TemplateSpecializationTypes(this_()),
       DependentTemplateSpecializationTypes(this_()), AutoTypes(this_()),
-      SubstTemplateTemplateParmPacks(this_()),
+      DependentBitIntTypes(this_()), SubstTemplateTemplateParmPacks(this_()),
       CanonTemplateTemplateParms(this_()), SourceMgr(SM), LangOpts(LOpts),
       NoSanitizeL(new NoSanitizeList(LangOpts.NoSanitizeFiles, SM)),
       XRayFilter(new XRayFunctionFilter(LangOpts.XRayAlwaysInstrumentFiles,
@@ -3786,11 +3790,8 @@ QualType ASTContext::getDependentSizedArrayType(QualType elementType,
   // initializer.  We do no canonicalization here at all, which is okay
   // because they can't be used in most locations.
   if (!numElements) {
-    auto *newType
-      = new (*this, TypeAlignment)
-          DependentSizedArrayType(*this, elementType, QualType(),
-                                  numElements, ASM, elementTypeQuals,
-                                  brackets);
+    auto *newType = new (*this, TypeAlignment) DependentSizedArrayType(
+        elementType, QualType(), numElements, ASM, elementTypeQuals, brackets);
     Types.push_back(newType);
     return QualType(newType, 0);
   }
@@ -3813,9 +3814,8 @@ QualType ASTContext::getDependentSizedArrayType(QualType elementType,
   // If we don't have one, build one.
   if (!canonTy) {
     canonTy = new (*this, TypeAlignment)
-      DependentSizedArrayType(*this, QualType(canonElementType.Ty, 0),
-                              QualType(), numElements, ASM, elementTypeQuals,
-                              brackets);
+        DependentSizedArrayType(QualType(canonElementType.Ty, 0), QualType(),
+                                numElements, ASM, elementTypeQuals, brackets);
     DependentSizedArrayTypes.InsertNode(canonTy, insertPos);
     Types.push_back(canonTy);
   }
@@ -3832,10 +3832,8 @@ QualType ASTContext::getDependentSizedArrayType(QualType elementType,
 
   // Otherwise, we need to build a type which follows the spelling
   // of the element type.
-  auto *sugaredType
-    = new (*this, TypeAlignment)
-        DependentSizedArrayType(*this, elementType, canon, numElements,
-                                ASM, elementTypeQuals, brackets);
+  auto *sugaredType = new (*this, TypeAlignment) DependentSizedArrayType(
+      elementType, canon, numElements, ASM, elementTypeQuals, brackets);
   Types.push_back(sugaredType);
   return QualType(sugaredType, 0);
 }
@@ -4111,12 +4109,12 @@ ASTContext::getDependentVectorType(QualType VecType, Expr *SizeExpr,
 
   if (Canon) {
     New = new (*this, TypeAlignment) DependentVectorType(
-        *this, VecType, QualType(Canon, 0), SizeExpr, AttrLoc, VecKind);
+        VecType, QualType(Canon, 0), SizeExpr, AttrLoc, VecKind);
   } else {
     QualType CanonVecTy = getCanonicalType(VecType);
     if (CanonVecTy == VecType) {
-      New = new (*this, TypeAlignment) DependentVectorType(
-          *this, VecType, QualType(), SizeExpr, AttrLoc, VecKind);
+      New = new (*this, TypeAlignment)
+          DependentVectorType(VecType, QualType(), SizeExpr, AttrLoc, VecKind);
 
       DependentVectorType *CanonCheck =
           DependentVectorTypes.FindNodeOrInsertPos(ID, InsertPos);
@@ -4127,8 +4125,8 @@ ASTContext::getDependentVectorType(QualType VecType, Expr *SizeExpr,
     } else {
       QualType CanonTy = getDependentVectorType(CanonVecTy, SizeExpr,
                                                 SourceLocation(), VecKind);
-      New = new (*this, TypeAlignment) DependentVectorType(
-          *this, VecType, CanonTy, SizeExpr, AttrLoc, VecKind);
+      New = new (*this, TypeAlignment)
+          DependentVectorType(VecType, CanonTy, SizeExpr, AttrLoc, VecKind);
     }
   }
 
@@ -4186,15 +4184,13 @@ ASTContext::getDependentSizedExtVectorType(QualType vecType,
   if (Canon) {
     // We already have a canonical version of this array type; use it as
     // the canonical type for a newly-built type.
-    New = new (*this, TypeAlignment)
-      DependentSizedExtVectorType(*this, vecType, QualType(Canon, 0),
-                                  SizeExpr, AttrLoc);
+    New = new (*this, TypeAlignment) DependentSizedExtVectorType(
+        vecType, QualType(Canon, 0), SizeExpr, AttrLoc);
   } else {
     QualType CanonVecTy = getCanonicalType(vecType);
     if (CanonVecTy == vecType) {
       New = new (*this, TypeAlignment)
-        DependentSizedExtVectorType(*this, vecType, QualType(), SizeExpr,
-                                    AttrLoc);
+          DependentSizedExtVectorType(vecType, QualType(), SizeExpr, AttrLoc);
 
       DependentSizedExtVectorType *CanonCheck
         = DependentSizedExtVectorTypes.FindNodeOrInsertPos(ID, InsertPos);
@@ -4204,8 +4200,8 @@ ASTContext::getDependentSizedExtVectorType(QualType vecType,
     } else {
       QualType CanonExtTy = getDependentSizedExtVectorType(CanonVecTy, SizeExpr,
                                                            SourceLocation());
-      New = new (*this, TypeAlignment) DependentSizedExtVectorType(
-          *this, vecType, CanonExtTy, SizeExpr, AttrLoc);
+      New = new (*this, TypeAlignment)
+          DependentSizedExtVectorType(vecType, CanonExtTy, SizeExpr, AttrLoc);
     }
   }
 
@@ -4260,7 +4256,7 @@ QualType ASTContext::getDependentSizedMatrixType(QualType ElementTy,
 
   if (!Canon) {
     Canon = new (*this, TypeAlignment) DependentSizedMatrixType(
-        *this, CanonElementTy, QualType(), RowExpr, ColumnExpr, AttrLoc);
+        CanonElementTy, QualType(), RowExpr, ColumnExpr, AttrLoc);
 #ifndef NDEBUG
     DependentSizedMatrixType *CanonCheck =
         DependentSizedMatrixTypes.FindNodeOrInsertPos(ID, InsertPos);
@@ -4279,7 +4275,7 @@ QualType ASTContext::getDependentSizedMatrixType(QualType ElementTy,
 
   // Use Canon as the canonical type for newly-built type.
   DependentSizedMatrixType *New = new (*this, TypeAlignment)
-      DependentSizedMatrixType(*this, ElementTy, QualType(Canon, 0), RowExpr,
+      DependentSizedMatrixType(ElementTy, QualType(Canon, 0), RowExpr,
                                ColumnExpr, AttrLoc);
   Types.push_back(New);
   return QualType(New, 0);
@@ -4301,9 +4297,8 @@ QualType ASTContext::getDependentAddressSpaceType(QualType PointeeType,
     DependentAddressSpaceTypes.FindNodeOrInsertPos(ID, insertPos);
 
   if (!canonTy) {
-    canonTy = new (*this, TypeAlignment)
-      DependentAddressSpaceType(*this, canonPointeeType,
-                                QualType(), AddrSpaceExpr, AttrLoc);
+    canonTy = new (*this, TypeAlignment) DependentAddressSpaceType(
+        canonPointeeType, QualType(), AddrSpaceExpr, AttrLoc);
     DependentAddressSpaceTypes.InsertNode(canonTy, insertPos);
     Types.push_back(canonTy);
   }
@@ -4312,10 +4307,8 @@ QualType ASTContext::getDependentAddressSpaceType(QualType PointeeType,
       canonTy->getAddrSpaceExpr() == AddrSpaceExpr)
     return QualType(canonTy, 0);
 
-  auto *sugaredType
-    = new (*this, TypeAlignment)
-        DependentAddressSpaceType(*this, PointeeType, QualType(canonTy, 0),
-                                  AddrSpaceExpr, AttrLoc);
+  auto *sugaredType = new (*this, TypeAlignment) DependentAddressSpaceType(
+      PointeeType, QualType(canonTy, 0), AddrSpaceExpr, AttrLoc);
   Types.push_back(sugaredType);
   return QualType(sugaredType, 0);
 }
@@ -4619,8 +4612,8 @@ QualType ASTContext::getDependentBitIntType(bool IsUnsigned,
           DependentBitIntTypes.FindNodeOrInsertPos(ID, InsertPos))
     return QualType(Existing, 0);
 
-  auto *New = new (*this, TypeAlignment)
-      DependentBitIntType(*this, IsUnsigned, NumBitsExpr);
+  auto *New =
+      new (*this, TypeAlignment) DependentBitIntType(IsUnsigned, NumBitsExpr);
   DependentBitIntTypes.InsertNode(New, InsertPos);
 
   Types.push_back(New);
@@ -5662,8 +5655,7 @@ QualType ASTContext::getTypeOfExprType(Expr *tofExpr, TypeOfKind Kind) const {
           TypeOfExprType(tofExpr, Kind, QualType((TypeOfExprType *)Canon, 0));
     } else {
       // Build a new, canonical typeof(expr) type.
-      Canon = new (*this, TypeAlignment)
-          DependentTypeOfExprType(*this, tofExpr, Kind);
+      Canon = new (*this, TypeAlignment) DependentTypeOfExprType(tofExpr, Kind);
       DependentTypeOfExprTypes.InsertNode(Canon, InsertPos);
       toe = Canon;
     }
@@ -5731,7 +5723,7 @@ QualType ASTContext::getDecltypeType(Expr *e, QualType UnderlyingType) const {
       = DependentDecltypeTypes.FindNodeOrInsertPos(ID, InsertPos);
     if (!Canon) {
       // Build a new, canonical decltype(expr) type.
-      Canon = new (*this, TypeAlignment) DependentDecltypeType(*this, e);
+      Canon = new (*this, TypeAlignment) DependentDecltypeType(e, DependentTy);
       DependentDecltypeTypes.InsertNode(Canon, InsertPos);
     }
     dt = new (*this, TypeAlignment)
@@ -11768,6 +11760,16 @@ GVALinkage ASTContext::GetGVALinkageForFunction(const FunctionDecl *FD) const {
 
 static GVALinkage basicGVALinkageForVariable(const ASTContext &Context,
                                              const VarDecl *VD) {
+  // As an extension for interactive REPLs, make sure constant variables are
+  // only emitted once instead of LinkageComputer::getLVForNamespaceScopeDecl
+  // marking them as internal.
+  if (Context.getLangOpts().CPlusPlus &&
+      Context.getLangOpts().IncrementalExtensions &&
+      VD->getType().isConstQualified() &&
+      !VD->getType().isVolatileQualified() && !VD->isInline() &&
+      !isa<VarTemplateSpecializationDecl>(VD) && !VD->getDescribedVarTemplate())
+    return GVA_DiscardableODR;
+
   if (!VD->isExternallyVisible())
     return GVA_Internal;
 
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index c7c2aecc8b179a..29b9d492904f5c 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -4521,6 +4521,8 @@ ExpectedDecl ASTNodeImporter::VisitImplicitParamDecl(ImplicitParamDecl *D) {
 Error ASTNodeImporter::ImportDefaultArgOfParmVarDecl(
     const ParmVarDecl *FromParam, ParmVarDecl *ToParam) {
   ToParam->setHasInheritedDefaultArg(FromParam->hasInheritedDefaultArg());
+  ToParam->setExplicitObjectParameterLoc(
+      FromParam->getExplicitObjectParamThisLoc());
   ToParam->setKNRPromoted(FromParam->isKNRPromoted());
 
   if (FromParam->hasUninstantiatedDefaultArg()) {
@@ -8975,6 +8977,10 @@ class AttrImporter {
 public:
   AttrImporter(ASTImporter &I) : Importer(I), NImporter(I) {}
 
+  // Useful for accessing the imported attribute.
+  template <typename T> T *castAttrAs() { return cast<T>(ToAttr); }
+  template <typename T> const T *castAttrAs() const { return cast<T>(ToAttr); }
+
   // Create an "importer" for an attribute parameter.
   // Result of the 'value()' of that object is to be passed to the function
   // 'importAttr', in the order that is expected by the attribute class.
@@ -9181,6 +9187,15 @@ Expected<Attr *> ASTImporter::Import(const Attr *FromAttr) {
                   From->args_size());
     break;
   }
+  case attr::CountedBy: {
+    AI.cloneAttr(FromAttr);
+    const auto *CBA = cast<CountedByAttr>(FromAttr);
+    Expected<SourceRange> SR = Import(CBA->getCountedByFieldLoc()).get();
+    if (!SR)
+      return SR.takeError();
+    AI.castAttrAs<CountedByAttr>()->setCountedByFieldLoc(SR.get());
+    break;
+  }
 
   default: {
     // The default branch works for attributes that have no arguments to import.
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index 5b98d14dd3d910..8ad142ff09b788 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -1403,6 +1403,8 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
       Method1->getAccess() == Method2->getAccess() &&
       Method1->getOverloadedOperator() == Method2->getOverloadedOperator() &&
       Method1->isStatic() == Method2->isStatic() &&
+      Method1->isImplicitObjectMemberFunction() ==
+          Method2->isImplicitObjectMemberFunction() &&
       Method1->isConst() == Method2->isConst() &&
       Method1->isVolatile() == Method2->isVolatile() &&
       Method1->isVirtual() == Method2->isVirtual() &&
diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp
index 09df5401d6693a..097753fd3267b5 100644
--- a/clang/lib/AST/ComputeDependence.cpp
+++ b/clang/lib/AST/ComputeDependence.cpp
@@ -484,6 +484,10 @@ ExprDependence clang::computeDependence(DeclRefExpr *E, const ASTContext &Ctx) {
 
   //    - an identifier associated by name lookup with one or more declarations
   //      declared with a dependent type
+  //    - an identifier associated by name lookup with an entity captured by
+  //    copy ([expr.prim.lambda.capture])
+  //      in a lambda-expression that has an explicit object parameter whose
+  //      type is dependent ([dcl.fct]),
   //
   // [The "or more" case is not modeled as a DeclRefExpr. There are a bunch
   // more bullets here that we handle by treating the declaration as having a
@@ -493,6 +497,11 @@ ExprDependence clang::computeDependence(DeclRefExpr *E, const ASTContext &Ctx) {
   else if (Type->isInstantiationDependentType())
     Deps |= ExprDependence::Instantiation;
 
+  //    - an identifier associated by name lookup with an entity captured by
+  //    copy ([expr.prim.lambda.capture])
+  if (E->isCapturedByCopyInLambdaWithExplicitObjectParameter())
+    Deps |= ExprDependence::Type;
+
   //    - a conversion-function-id that specifies a dependent type
   if (Decl->getDeclName().getNameKind() ==
       DeclarationName::CXXConversionFunctionName) {
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index dc02706f649967..b88df1edf845ac 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3652,6 +3652,20 @@ unsigned FunctionDecl::getMinRequiredArguments() const {
   return NumRequiredArgs;
 }
 
+bool FunctionDecl::hasCXXExplicitFunctionObjectParameter() const {
+  return getNumParams() != 0 && getParamDecl(0)->isExplicitObjectParameter();
+}
+
+unsigned FunctionDecl::getNumNonObjectParams() const {
+  return getNumParams() -
+         static_cast<unsigned>(hasCXXExplicitFunctionObjectParameter());
+}
+
+unsigned FunctionDecl::getMinRequiredExplicitArguments() const {
+  return getMinRequiredArguments() -
+         static_cast<unsigned>(hasCXXExplicitFunctionObjectParameter());
+}
+
 bool FunctionDecl::hasOneParamOrDefaultArgs() const {
   return getNumParams() == 1 ||
          (getNumParams() > 1 &&
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 834beef49a4441..46372f97289c30 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -29,7 +29,6 @@
 #include "clang/AST/Type.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LLVM.h"
-#include "clang/Basic/LangOptions.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/ObjCRuntime.h"
 #include "clang/Basic/PartialDiagnostic.h"
@@ -411,6 +410,81 @@ bool Decl::isFileContextDecl() const {
   return DC && DC->isFileContext();
 }
 
+bool Decl::isFlexibleArrayMemberLike(
+    ASTContext &Ctx, const Decl *D, QualType Ty,
+    LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel,
+    bool IgnoreTemplateOrMacroSubstitution) {
+  // For compatibility with existing code, we treat arrays of length 0 or
+  // 1 as flexible array members.
+  const auto *CAT = Ctx.getAsConstantArrayType(Ty);
+  if (CAT) {
+    using FAMKind = LangOptions::StrictFlexArraysLevelKind;
+
+    llvm::APInt Size = CAT->getSize();
+    FAMKind StrictFlexArraysLevel =
+        Ctx.getLangOpts().getStrictFlexArraysLevel();
+
+    if (StrictFlexArraysLevel == FAMKind::IncompleteOnly)
+      return false;
+
+    // GCC extension, only allowed to represent a FAM.
+    if (Size.isZero())
+      return true;
+
+    if (StrictFlexArraysLevel == FAMKind::ZeroOrIncomplete && Size.uge(1))
+      return false;
+
+    if (StrictFlexArraysLevel == FAMKind::OneZeroOrIncomplete && Size.uge(2))
+      return false;
+  } else if (!Ctx.getAsIncompleteArrayType(Ty)) {
+    return false;
+  }
+
+  if (const auto *OID = dyn_cast_if_present<ObjCIvarDecl>(D))
+    return OID->getNextIvar() == nullptr;
+
+  const auto *FD = dyn_cast_if_present<FieldDecl>(D);
+  if (!FD)
+    return false;
+
+  if (CAT) {
+    // GCC treats an array memeber of a union as an FAM if the size is one or
+    // zero.
+    llvm::APInt Size = CAT->getSize();
+    if (FD->getParent()->isUnion() && (Size.isZero() || Size.isOne()))
+      return true;
+  }
+
+  // Don't consider sizes resulting from macro expansions or template argument
+  // substitution to form C89 tail-padded arrays.
+  if (IgnoreTemplateOrMacroSubstitution) {
+    TypeSourceInfo *TInfo = FD->getTypeSourceInfo();
+    while (TInfo) {
+      TypeLoc TL = TInfo->getTypeLoc();
+
+      // Look through typedefs.
+      if (TypedefTypeLoc TTL = TL.getAsAdjusted<TypedefTypeLoc>()) {
+        const TypedefNameDecl *TDL = TTL.getTypedefNameDecl();
+        TInfo = TDL->getTypeSourceInfo();
+        continue;
+      }
+
+      if (auto CTL = TL.getAs<ConstantArrayTypeLoc>()) {
+        const Expr *SizeExpr = dyn_cast<IntegerLiteral>(CTL.getSizeExpr());
+        if (!SizeExpr || SizeExpr->getExprLoc().isMacroID())
+          return false;
+      }
+
+      break;
+    }
+  }
+
+  // Test that the field is the last in the structure.
+  RecordDecl::field_iterator FI(
+      DeclContext::decl_iterator(const_cast<FieldDecl *>(FD)));
+  return ++FI == FD->getParent()->field_end();
+}
+
 TranslationUnitDecl *Decl::getTranslationUnitDecl() {
   if (auto *TUD = dyn_cast<TranslationUnitDecl>(this))
     return TUD;
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 9d9c33820c3640..42bab4ed51b729 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -838,7 +838,7 @@ void CXXRecordDecl::addedMember(Decl *D) {
       SMKind |= SMF_CopyAssignment;
 
       const auto *ParamTy =
-          Method->getParamDecl(0)->getType()->getAs<ReferenceType>();
+          Method->getNonObjectParameter(0)->getType()->getAs<ReferenceType>();
       if (!ParamTy || ParamTy->getPointeeType().isConstQualified())
         data().HasDeclaredCopyAssignmentWithConstParam = true;
     }
@@ -2409,6 +2409,17 @@ bool CXXMethodDecl::isUsualDeallocationFunction(
   return Result;
 }
 
+bool CXXMethodDecl::isExplicitObjectMemberFunction() const {
+  // C++2b [dcl.fct]p6:
+  // An explicit object member function is a non-static member
+  // function with an explicit object parameter
+  return !isStatic() && hasCXXExplicitFunctionObjectParameter();
+}
+
+bool CXXMethodDecl::isImplicitObjectMemberFunction() const {
+  return !isStatic() && !hasCXXExplicitFunctionObjectParameter();
+}
+
 bool CXXMethodDecl::isCopyAssignmentOperator() const {
   // C++0x [class.copy]p17:
   //  A user-declared copy assignment operator X::operator= is a non-static
@@ -2416,11 +2427,12 @@ bool CXXMethodDecl::isCopyAssignmentOperator() const {
   //  type X, X&, const X&, volatile X& or const volatile X&.
   if (/*operator=*/getOverloadedOperator() != OO_Equal ||
       /*non-static*/ isStatic() ||
-      /*non-template*/getPrimaryTemplate() || getDescribedFunctionTemplate() ||
-      getNumParams() != 1)
+
+      /*non-template*/ getPrimaryTemplate() || getDescribedFunctionTemplate() ||
+      getNumExplicitParams() != 1)
     return false;
 
-  QualType ParamType = getParamDecl(0)->getType();
+  QualType ParamType = getNonObjectParameter(0)->getType();
   if (const auto *Ref = ParamType->getAs<LValueReferenceType>())
     ParamType = Ref->getPointeeType();
 
@@ -2437,10 +2449,10 @@ bool CXXMethodDecl::isMoveAssignmentOperator() const {
   //  X&&, const X&&, volatile X&&, or const volatile X&&.
   if (getOverloadedOperator() != OO_Equal || isStatic() ||
       getPrimaryTemplate() || getDescribedFunctionTemplate() ||
-      getNumParams() != 1)
+      getNumExplicitParams() != 1)
     return false;
 
-  QualType ParamType = getParamDecl(0)->getType();
+  QualType ParamType = getNonObjectParameter(0)->getType();
   if (!ParamType->isRValueReferenceType())
     return false;
   ParamType = ParamType->getPointeeType();
@@ -2496,12 +2508,6 @@ QualType CXXMethodDecl::getThisType(const FunctionProtoType *FPT,
                               : C.getPointerType(ObjectTy);
 }
 
-QualType CXXMethodDecl::getThisObjectType(const FunctionProtoType *FPT,
-                                          const CXXRecordDecl *Decl) {
-  ASTContext &C = Decl->getASTContext();
-  return ::getThisObjectType(C, FPT, Decl);
-}
-
 QualType CXXMethodDecl::getThisType() const {
   // C++ 9.3.2p1: The type of this in a member function of a class X is X*.
   // If the member function is declared const, the type of this is const X*,
@@ -2513,11 +2519,17 @@ QualType CXXMethodDecl::getThisType() const {
                                     getParent());
 }
 
-QualType CXXMethodDecl::getThisObjectType() const {
-  // Ditto getThisType.
-  assert(isInstance() && "No 'this' for static methods!");
-  return CXXMethodDecl::getThisObjectType(
-      getType()->castAs<FunctionProtoType>(), getParent());
+QualType CXXMethodDecl::getFunctionObjectParameterReferenceType() const {
+  if (isExplicitObjectMemberFunction())
+    return parameters()[0]->getType();
+
+  ASTContext &C = getParentASTContext();
+  const FunctionProtoType *FPT = getType()->castAs<FunctionProtoType>();
+  QualType Type = ::getThisObjectType(C, FPT, getParent());
+  RefQualifierKind RK = FPT->getRefQualifier();
+  if (RK == RefQualifierKind::RQ_RValue)
+    return C.getRValueReferenceType(Type);
+  return C.getLValueReferenceType(Type);
 }
 
 bool CXXMethodDecl::hasInlineBody() const {
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 748ed3721d23f9..daa219b43c4e6c 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -962,6 +962,10 @@ void DeclPrinter::VisitLabelDecl(LabelDecl *D) {
 void DeclPrinter::VisitVarDecl(VarDecl *D) {
   prettyPrintPragmas(D);
 
+  if (const auto *Param = dyn_cast<ParmVarDecl>(D);
+      Param && Param->isExplicitObjectParameter())
+    Out << "this ";
+
   std::string LeftSide;
   llvm::raw_string_ostream LeftSideStream(LeftSide);
 
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 4f3837371b3fc5..5d3b510df1ef9b 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -205,85 +205,22 @@ bool Expr::isKnownToHaveBooleanValue(bool Semantic) const {
 }
 
 bool Expr::isFlexibleArrayMemberLike(
-    ASTContext &Context,
+    ASTContext &Ctx,
     LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel,
     bool IgnoreTemplateOrMacroSubstitution) const {
-
-  // For compatibility with existing code, we treat arrays of length 0 or
-  // 1 as flexible array members.
-  const auto *CAT = Context.getAsConstantArrayType(getType());
-  if (CAT) {
-    llvm::APInt Size = CAT->getSize();
-
-    using FAMKind = LangOptions::StrictFlexArraysLevelKind;
-
-    if (StrictFlexArraysLevel == FAMKind::IncompleteOnly)
-      return false;
-
-    // GCC extension, only allowed to represent a FAM.
-    if (Size == 0)
-      return true;
-
-    if (StrictFlexArraysLevel == FAMKind::ZeroOrIncomplete && Size.uge(1))
-      return false;
-
-    if (StrictFlexArraysLevel == FAMKind::OneZeroOrIncomplete && Size.uge(2))
-      return false;
-  } else if (!Context.getAsIncompleteArrayType(getType()))
-    return false;
-
   const Expr *E = IgnoreParens();
+  const Decl *D = nullptr;
 
-  const NamedDecl *ND = nullptr;
-  if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
-    ND = DRE->getDecl();
-  else if (const auto *ME = dyn_cast<MemberExpr>(E))
-    ND = ME->getMemberDecl();
+  if (const auto *ME = dyn_cast<MemberExpr>(E))
+    D = ME->getMemberDecl();
+  else if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
+    D = DRE->getDecl();
   else if (const auto *IRE = dyn_cast<ObjCIvarRefExpr>(E))
-    return IRE->getDecl()->getNextIvar() == nullptr;
+    D = IRE->getDecl();
 
-  if (!ND)
-    return false;
-
-  // A flexible array member must be the last member in the class.
-  // FIXME: If the base type of the member expr is not FD->getParent(),
-  // this should not be treated as a flexible array member access.
-  if (const auto *FD = dyn_cast<FieldDecl>(ND)) {
-    // GCC treats an array memeber of a union as an FAM if the size is one or
-    // zero.
-    if (CAT) {
-      llvm::APInt Size = CAT->getSize();
-      if (FD->getParent()->isUnion() && (Size.isZero() || Size.isOne()))
-        return true;
-    }
-
-    // Don't consider sizes resulting from macro expansions or template argument
-    // substitution to form C89 tail-padded arrays.
-    if (IgnoreTemplateOrMacroSubstitution) {
-      TypeSourceInfo *TInfo = FD->getTypeSourceInfo();
-      while (TInfo) {
-        TypeLoc TL = TInfo->getTypeLoc();
-        // Look through typedefs.
-        if (TypedefTypeLoc TTL = TL.getAsAdjusted<TypedefTypeLoc>()) {
-          const TypedefNameDecl *TDL = TTL.getTypedefNameDecl();
-          TInfo = TDL->getTypeSourceInfo();
-          continue;
-        }
-        if (ConstantArrayTypeLoc CTL = TL.getAs<ConstantArrayTypeLoc>()) {
-          const Expr *SizeExpr = dyn_cast<IntegerLiteral>(CTL.getSizeExpr());
-          if (!SizeExpr || SizeExpr->getExprLoc().isMacroID())
-            return false;
-        }
-        break;
-      }
-    }
-
-    RecordDecl::field_iterator FI(
-        DeclContext::decl_iterator(const_cast<FieldDecl *>(FD)));
-    return ++FI == FD->getParent()->field_end();
-  }
-
-  return false;
+  return Decl::isFlexibleArrayMemberLike(Ctx, D, E->getType(),
+                                         StrictFlexArraysLevel,
+                                         IgnoreTemplateOrMacroSubstitution);
 }
 
 const ValueDecl *
@@ -491,6 +428,7 @@ DeclRefExpr::DeclRefExpr(const ASTContext &Ctx, ValueDecl *D,
   DeclRefExprBits.HadMultipleCandidates = false;
   DeclRefExprBits.RefersToEnclosingVariableOrCapture =
       RefersToEnclosingVariableOrCapture;
+  DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = false;
   DeclRefExprBits.NonOdrUseReason = NOUR;
   DeclRefExprBits.IsImmediateEscalating = false;
   DeclRefExprBits.Loc = L;
@@ -518,6 +456,7 @@ DeclRefExpr::DeclRefExpr(const ASTContext &Ctx,
     = (TemplateArgs || TemplateKWLoc.isValid()) ? 1 : 0;
   DeclRefExprBits.RefersToEnclosingVariableOrCapture =
       RefersToEnclosingVariableOrCapture;
+  DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = false;
   DeclRefExprBits.NonOdrUseReason = NOUR;
   if (TemplateArgs) {
     auto Deps = TemplateArgumentDependence::None;
@@ -659,7 +598,7 @@ std::string SYCLUniqueStableNameExpr::ComputeName(ASTContext &Context,
   std::string Buffer;
   Buffer.reserve(128);
   llvm::raw_string_ostream Out(Buffer);
-  Ctx->mangleTypeName(Ty, Out);
+  Ctx->mangleCanonicalTypeName(Ty, Out);
 
   return Out.str();
 }
@@ -1619,6 +1558,10 @@ QualType CallExpr::getCallReturnType(const ASTContext &Ctx) const {
     // This should never be overloaded and so should never return null.
     CalleeType = Expr::findBoundMemberType(Callee);
     assert(!CalleeType.isNull());
+  } else if (CalleeType->isRecordType()) {
+    // If the Callee is a record type, then it is a not-yet-resolved
+    // dependent call to the call operator of that type.
+    return Ctx.DependentTy;
   } else if (CalleeType->isDependentType() ||
              CalleeType->isSpecificPlaceholderType(BuiltinType::Overload)) {
     return Ctx.DependentTy;
diff --git a/clang/lib/AST/ExprClassification.cpp b/clang/lib/AST/ExprClassification.cpp
index 12193b7812f9bf..ffa7c6802ea6e1 100644
--- a/clang/lib/AST/ExprClassification.cpp
+++ b/clang/lib/AST/ExprClassification.cpp
@@ -465,8 +465,13 @@ static Cl::Kinds ClassifyDecl(ASTContext &Ctx, const Decl *D) {
   // lvalue unless it's a reference type (C++ [temp.param]p6), so we need to
   // special-case this.
 
-  if (isa<CXXMethodDecl>(D) && cast<CXXMethodDecl>(D)->isInstance())
-    return Cl::CL_MemberFunction;
+  if (const auto *M = dyn_cast<CXXMethodDecl>(D)) {
+    if (M->isImplicitObjectMemberFunction())
+      return Cl::CL_MemberFunction;
+    if (M->isStatic())
+      return Cl::CL_LValue;
+    return Cl::CL_PRValue;
+  }
 
   bool islvalue;
   if (const auto *NTTParm = dyn_cast<NonTypeTemplateParmDecl>(D))
@@ -551,8 +556,13 @@ static Cl::Kinds ClassifyMemberExpr(ASTContext &Ctx, const MemberExpr *E) {
   //      -- If it refers to a static member function [...], then E1.E2 is an
   //         lvalue; [...]
   //      -- Otherwise [...] E1.E2 is a prvalue.
-  if (const auto *Method = dyn_cast<CXXMethodDecl>(Member))
-    return Method->isStatic() ? Cl::CL_LValue : Cl::CL_MemberFunction;
+  if (const auto *Method = dyn_cast<CXXMethodDecl>(Member)) {
+    if (Method->isStatic())
+      return Cl::CL_LValue;
+    if (Method->isImplicitObjectMemberFunction())
+      return Cl::CL_MemberFunction;
+    return Cl::CL_PRValue;
+  }
 
   //   -- If E2 is a member enumerator [...], the expression E1.E2 is a prvalue.
   // So is everything else we haven't handled yet.
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index f7a99bdae18f98..5a33e918db8e8c 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -1945,9 +1945,9 @@ APValue *EvalInfo::createHeapAlloc(const Expr *E, QualType T, LValue &LV) {
 /// Produce a string describing the given constexpr call.
 void CallStackFrame::describe(raw_ostream &Out) const {
   unsigned ArgIndex = 0;
-  bool IsMemberCall = isa<CXXMethodDecl>(Callee) &&
-                      !isa<CXXConstructorDecl>(Callee) &&
-                      cast<CXXMethodDecl>(Callee)->isInstance();
+  bool IsMemberCall =
+      isa<CXXMethodDecl>(Callee) && !isa<CXXConstructorDecl>(Callee) &&
+      cast<CXXMethodDecl>(Callee)->isImplicitObjectMemberFunction();
 
   if (!IsMemberCall)
     Callee->getNameForDiagnostic(Out, Info.Ctx.getPrintingPolicy(),
@@ -3357,6 +3357,9 @@ static bool evaluateVarDeclInit(EvalInfo &Info, const Expr *E,
     return false;
   }
 
+  if (E->isValueDependent())
+    return false;
+
   // Dig out the initializer, and use the declaration which it's attached to.
   // FIXME: We should eventually check whether the variable has a reachable
   // initializing declaration.
@@ -4711,6 +4714,9 @@ static bool EvaluateObjectArgument(EvalInfo &Info, const Expr *Object,
   if (Object->getType()->isLiteralType(Info.Ctx))
     return EvaluateTemporary(Object, This, Info);
 
+  if (Object->getType()->isRecordType() && Object->isPRValue())
+    return EvaluateTemporary(Object, This, Info);
+
   Info.FFDiag(Object, diag::note_constexpr_nonliteral) << Object->getType();
   return false;
 }
@@ -7789,15 +7795,18 @@ class ExprEvaluatorBase
       if (OCE && OCE->isAssignmentOp()) {
         assert(Args.size() == 2 && "wrong number of arguments in assignment");
         Call = Info.CurrentCall->createCall(FD);
-        if (!EvaluateArgs(isa<CXXMethodDecl>(FD) ? Args.slice(1) : Args, Call,
-                          Info, FD, /*RightToLeft=*/true))
+        bool HasThis = false;
+        if (const auto *MD = dyn_cast<CXXMethodDecl>(FD))
+          HasThis = MD->isImplicitObjectMemberFunction();
+        if (!EvaluateArgs(HasThis ? Args.slice(1) : Args, Call, Info, FD,
+                          /*RightToLeft=*/true))
           return false;
       }
 
       // Overloaded operator calls to member functions are represented as normal
       // calls with '*this' as the first argument.
       const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD);
-      if (MD && !MD->isStatic()) {
+      if (MD && MD->isImplicitObjectMemberFunction()) {
         // FIXME: When selecting an implicit conversion for an overloaded
         // operator delete, we sometimes try to evaluate calls to conversion
         // operators without a 'this' parameter!
@@ -7881,7 +7890,7 @@ class ExprEvaluatorBase
                                    CovariantAdjustmentPath);
         if (!FD)
           return false;
-      } else {
+      } else if (NamedMember && NamedMember->isImplicitObjectMemberFunction()) {
         // Check that the 'this' pointer points to an object of the right type.
         // FIXME: If this is an assignment operator call, we may need to change
         // the active union member before we check this.
@@ -8361,7 +8370,13 @@ bool LValueExprEvaluator::VisitVarDecl(const Expr *E, const VarDecl *VD) {
 
     if (auto *FD = Info.CurrentCall->LambdaCaptureFields.lookup(VD)) {
       // Start with 'Result' referring to the complete closure object...
-      Result = *Info.CurrentCall->This;
+      if (auto *MD = cast<CXXMethodDecl>(Info.CurrentCall->Callee);
+          MD->isExplicitObjectMemberFunction()) {
+        APValue *RefValue =
+            Info.getParamSlot(Info.CurrentCall->Arguments, MD->getParamDecl(0));
+        Result.setFrom(Info.Ctx, *RefValue);
+      } else
+        Result = *Info.CurrentCall->This;
       // ... then update it to refer to the field of the closure object
       // that represents the capture.
       if (!HandleLValueMember(Info, E, Result, FD))
@@ -16284,7 +16299,8 @@ bool Expr::EvaluateWithSubstitution(APValue &Value, ASTContext &Ctx,
 #ifndef NDEBUG
     auto *MD = dyn_cast<CXXMethodDecl>(Callee);
     assert(MD && "Don't provide `this` for non-methods.");
-    assert(!MD->isStatic() && "Don't provide `this` for static methods.");
+    assert(MD->isImplicitObjectMemberFunction() &&
+           "Don't provide `this` for methods without an implicit object.");
 #endif
     if (!This->isValueDependent() &&
         EvaluateObjectArgument(Info, This, ThisVal) &&
@@ -16379,9 +16395,10 @@ bool Expr::isPotentialConstantExpr(const FunctionDecl *FD,
     HandleConstructorCall(&VIE, This, Args, CD, Info, Scratch);
   } else {
     SourceLocation Loc = FD->getLocation();
-    HandleFunctionCall(Loc, FD, (MD && MD->isInstance()) ? &This : nullptr,
-                       &VIE, Args, CallRef(), FD->getBody(), Info, Scratch,
-                       /*ResultSlot=*/nullptr);
+    HandleFunctionCall(
+        Loc, FD, (MD && MD->isImplicitObjectMemberFunction()) ? &This : nullptr,
+        &VIE, Args, CallRef(), FD->getBody(), Info, Scratch,
+        /*ResultSlot=*/nullptr);
   }
 
   return Diags.empty();
diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
index 23c28b430d81c2..a634ee288f57c9 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
@@ -44,7 +44,7 @@ ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
   // InterpStack when calling the function.
   bool HasThisPointer = false;
   if (const auto *MD = dyn_cast<CXXMethodDecl>(FuncDecl)) {
-    if (MD->isInstance()) {
+    if (MD->isImplicitObjectMemberFunction()) {
       HasThisPointer = true;
       ParamTypes.push_back(PT_Ptr);
       ParamOffsets.push_back(ParamOffset);
@@ -64,8 +64,8 @@ ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
         this->LambdaCaptures[Cap.first] = {
             Offset, Cap.second->getType()->isReferenceType()};
       }
-      // FIXME: LambdaThisCapture
-      (void)LTC;
+      if (LTC)
+        this->LambdaThisCapture = R->getField(LTC)->Offset;
     }
   }
 
diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.h b/clang/lib/AST/Interp/ByteCodeEmitter.h
index d184484a9197f9..5520f8c3006106 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.h
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.h
@@ -67,7 +67,8 @@ class ByteCodeEmitter {
   llvm::DenseMap<const ParmVarDecl *, ParamOffset> Params;
   /// Lambda captures.
   llvm::DenseMap<const ValueDecl *, ParamOffset> LambdaCaptures;
-  unsigned LambdaThisCapture;
+  /// Offset of the This parameter in a lambda record.
+  unsigned LambdaThisCapture = 0;
   /// Local descriptors.
   llvm::SmallVector<SmallVector<Local, 8>, 2> Descriptors;
 
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index e266804a4e75de..3cf8bf874b1d21 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -561,7 +561,7 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
         if (!this->visitInitializer(Init))
           return false;
 
-        if (!this->emitPopPtr(E))
+        if (!this->emitInitPtrPop(E))
           return false;
         // Base initializers don't increase InitIndex, since they don't count
         // into the Record's fields.
@@ -1718,7 +1718,7 @@ bool ByteCodeExprGen<Emitter>::visitZeroRecordInitializer(const Record *R,
       return false;
     if (!this->visitZeroRecordInitializer(B.R, E))
       return false;
-    if (!this->emitPopPtr(E))
+    if (!this->emitInitPtrPop(E))
       return false;
   }
 
@@ -2305,6 +2305,10 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
   if (DiscardResult)
     return true;
+
+  if (this->LambdaThisCapture > 0)
+    return this->emitGetThisFieldPtr(this->LambdaThisCapture, E);
+
   return this->emitThis(E);
 }
 
@@ -2536,7 +2540,8 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
   // This happens in C.
   if (!Ctx.getLangOpts().CPlusPlus) {
     if (const auto *VD = dyn_cast<VarDecl>(D);
-        VD && VD->hasGlobalStorage() && VD->getAnyInitializer()) {
+        VD && VD->hasGlobalStorage() && VD->getAnyInitializer() &&
+        VD->getType().isConstQualified()) {
       if (!this->visitVarDecl(VD))
         return false;
       // Retry.
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index 15eae8e20b3a67..a81c82c38955e9 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -191,7 +191,7 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) {
           return false;
         if (!this->visitInitializer(InitExpr))
           return false;
-        if (!this->emitPopPtr(InitExpr))
+        if (!this->emitInitPtrPop(InitExpr))
           return false;
       }
     }
diff --git a/clang/lib/AST/Interp/EvalEmitter.h b/clang/lib/AST/Interp/EvalEmitter.h
index c63e46bbf347bd..1128e0fb09f0de 100644
--- a/clang/lib/AST/Interp/EvalEmitter.h
+++ b/clang/lib/AST/Interp/EvalEmitter.h
@@ -73,7 +73,8 @@ class EvalEmitter : public SourceMapper {
   llvm::DenseMap<const ParmVarDecl *, ParamOffset> Params;
   /// Lambda captures.
   llvm::DenseMap<const ValueDecl *, ParamOffset> LambdaCaptures;
-  unsigned LambdaThisCapture;
+  /// Offset of the This parameter in a lambda record.
+  unsigned LambdaThisCapture = 0;
   /// Local descriptors.
   llvm::SmallVector<SmallVector<Local, 8>, 2> Descriptors;
 
diff --git a/clang/lib/AST/Interp/IntegralAP.h b/clang/lib/AST/Interp/IntegralAP.h
index 53ad76d8a1997e..a8df431bef1178 100644
--- a/clang/lib/AST/Interp/IntegralAP.h
+++ b/clang/lib/AST/Interp/IntegralAP.h
@@ -32,14 +32,22 @@ template <unsigned Bits, bool Signed> class Integral;
 class Boolean;
 
 template <bool Signed> class IntegralAP final {
-public:
+private:
+  friend IntegralAP<!Signed>;
   APSInt V;
 
+  template <typename T> static T truncateCast(const APSInt &V) {
+    return std::is_signed_v<T> ? V.trunc(sizeof(T) * 8).getSExtValue()
+                               : V.trunc(sizeof(T) * 8).getZExtValue();
+  }
+
 public:
   using AsUnsigned = IntegralAP<false>;
 
   template <typename T>
-  IntegralAP(T Value) : V(APInt(sizeof(T) * 8, Value, std::is_signed_v<T>)) {}
+  IntegralAP(T Value)
+      : V(APInt(sizeof(T) * 8, static_cast<uint64_t>(Value),
+                std::is_signed_v<T>)) {}
 
   IntegralAP(APInt V) : V(V) {}
   IntegralAP(APSInt V) : V(V) {}
@@ -53,18 +61,18 @@ template <bool Signed> class IntegralAP final {
   bool operator<=(IntegralAP RHS) const { return V <= RHS.V; }
 
   explicit operator bool() const { return !V.isZero(); }
-  explicit operator int8_t() const { return V.getSExtValue(); }
-  explicit operator uint8_t() const { return V.getZExtValue(); }
-  explicit operator int16_t() const { return V.getSExtValue(); }
-  explicit operator uint16_t() const { return V.getZExtValue(); }
-  explicit operator int32_t() const { return V.getSExtValue(); }
-  explicit operator uint32_t() const { return V.getZExtValue(); }
-  explicit operator int64_t() const { return V.getSExtValue(); }
-  explicit operator uint64_t() const { return V.getZExtValue(); }
+  explicit operator int8_t() const { return truncateCast<int8_t>(V); }
+  explicit operator uint8_t() const { return truncateCast<uint8_t>(V); }
+  explicit operator int16_t() const { return truncateCast<int16_t>(V); }
+  explicit operator uint16_t() const { return truncateCast<uint16_t>(V); }
+  explicit operator int32_t() const { return truncateCast<int32_t>(V); }
+  explicit operator uint32_t() const { return truncateCast<uint32_t>(V); }
+  explicit operator int64_t() const { return truncateCast<int64_t>(V); }
+  explicit operator uint64_t() const { return truncateCast<uint64_t>(V); }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
     assert(NumBits > 0);
-    APSInt Copy = APSInt(APInt(NumBits, Value, Signed), !Signed);
+    APSInt Copy = APSInt(APInt(NumBits, static_cast<int64_t>(Value), Signed), !Signed);
 
     return IntegralAP<Signed>(Copy);
   }
@@ -208,7 +216,6 @@ template <bool Signed> class IntegralAP final {
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    assert(false);
     *R = IntegralAP(~A.V);
     return false;
   }
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index e1951574edb628..8e851595963184 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -467,6 +467,12 @@ static bool CheckFieldsInitialized(InterpState &S, CodePtr OpPC,
   // Check Fields in all bases
   for (const Record::Base &B : R->bases()) {
     Pointer P = BasePtr.atField(B.Offset);
+    if (!P.isInitialized()) {
+      S.FFDiag(BasePtr.getDeclDesc()->asDecl()->getLocation(),
+               diag::note_constexpr_uninitialized_base)
+          << B.Desc->getType();
+      return false;
+    }
     Result &= CheckFieldsInitialized(S, OpPC, P, B.R);
   }
 
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 9d5ec3315415cf..d62e64bedb213a 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1245,6 +1245,12 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
   return true;
 }
 
+inline bool InitPtrPop(InterpState &S, CodePtr OpPC) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+  Ptr.initialize();
+  return true;
+}
+
 inline bool VirtBaseHelper(InterpState &S, CodePtr OpPC, const RecordDecl *Decl,
                            const Pointer &Ptr) {
   Pointer Base = Ptr;
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index d816145598049b..bba0255219bc0d 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -398,6 +398,16 @@ static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC,
+                                     const InterpFrame *Frame,
+                                     const Function *Func,
+                                     const CallExpr *Call) {
+  PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
+  APSInt Val = peekToAPSInt(S.Stk, ArgT);
+  pushInt(S, Val.popcount());
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
   InterpFrame *Frame = S.Current;
@@ -513,6 +523,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
       return Ret<PT_Float>(S, OpPC, Dummy);
     break;
 
+  case Builtin::BI__builtin_popcount:
+  case Builtin::BI__builtin_popcountl:
+  case Builtin::BI__builtin_popcountll:
+  case Builtin::BI__popcnt16: // Microsoft variants of popcount
+  case Builtin::BI__popcnt:
+  case Builtin::BI__popcnt64:
+    if (interp__builtin_popcount(S, OpPC, Frame, F, Call))
+      return retInt(S, OpPC, Dummy);
+    break;
+
   default:
     return false;
   }
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 9fc4938bb37bde..50b6c0ac154de3 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -304,6 +304,10 @@ def GetPtrBasePop : Opcode {
   let Args = [ArgUint32];
 }
 
+def InitPtrPop : Opcode {
+  let Args = [];
+}
+
 def GetPtrDerivedPop : Opcode {
   let Args = [ArgUint32];
 }
diff --git a/clang/lib/AST/Interp/Source.h b/clang/lib/AST/Interp/Source.h
index e97e897e72394d..fec950dd544f8a 100644
--- a/clang/lib/AST/Interp/Source.h
+++ b/clang/lib/AST/Interp/Source.h
@@ -14,12 +14,12 @@
 #define LLVM_CLANG_AST_INTERP_SOURCE_H
 
 #include "PrimType.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/Stmt.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/Support/Endian.h"
 
 namespace clang {
-class Stmt;
-class Decl;
 class Expr;
 class SourceLocation;
 class SourceRange;
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 830a35b98d6072..23ec35cae4b7b4 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -112,8 +112,8 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext {
   void mangleCXXRTTI(QualType T, raw_ostream &) override;
   void mangleCXXRTTIName(QualType T, raw_ostream &,
                          bool NormalizeIntegers) override;
-  void mangleTypeName(QualType T, raw_ostream &,
-                      bool NormalizeIntegers) override;
+  void mangleCanonicalTypeName(QualType T, raw_ostream &,
+                               bool NormalizeIntegers) override;
 
   void mangleCXXCtorComdat(const CXXConstructorDecl *D, raw_ostream &) override;
   void mangleCXXDtorComdat(const CXXDestructorDecl *D, raw_ostream &) override;
@@ -1721,7 +1721,7 @@ void CXXNameMangler::mangleUnqualifiedName(
 
       // If we have a member function, we need to include the 'this' pointer.
       if (const auto *MD = dyn_cast<CXXMethodDecl>(ND))
-        if (!MD->isStatic())
+        if (MD->isImplicitObjectMemberFunction())
           Arity++;
     }
     [[fallthrough]];
@@ -1781,6 +1781,8 @@ void CXXNameMangler::mangleNestedName(GlobalDecl GD,
     Qualifiers MethodQuals = Method->getMethodQualifiers();
     // We do not consider restrict a distinguishing attribute for overloading
     // purposes so we must not mangle it.
+    if (Method->isExplicitObjectMemberFunction())
+      Out << 'H';
     MethodQuals.removeRestrict();
     mangleQualifiers(MethodQuals);
     mangleRefQualifier(Method->getRefQualifier());
@@ -7130,8 +7132,8 @@ void ItaniumMangleContextImpl::mangleCXXRTTIName(
   Mangler.mangleType(Ty);
 }
 
-void ItaniumMangleContextImpl::mangleTypeName(QualType Ty, raw_ostream &Out,
-                                              bool NormalizeIntegers = false) {
+void ItaniumMangleContextImpl::mangleCanonicalTypeName(
+    QualType Ty, raw_ostream &Out, bool NormalizeIntegers = false) {
   mangleCXXRTTIName(Ty, Out, NormalizeIntegers);
 }
 
diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
index e67c2c7e216dce..beb07015f0bcbb 100644
--- a/clang/lib/AST/JSONNodeDumper.cpp
+++ b/clang/lib/AST/JSONNodeDumper.cpp
@@ -876,6 +876,9 @@ void JSONNodeDumper::VisitUsingShadowDecl(const UsingShadowDecl *USD) {
 void JSONNodeDumper::VisitVarDecl(const VarDecl *VD) {
   VisitNamedDecl(VD);
   JOS.attribute("type", createQualType(VD->getType()));
+  if (const auto *P = dyn_cast<ParmVarDecl>(VD))
+    attributeOnlyIfTrue("explicitObjectParameter",
+                        P->isExplicitObjectParameter());
 
   StorageClass SC = VD->getStorageClass();
   if (SC != SC_None)
diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp
index 53af9fc4d51897..64c971912a91d0 100644
--- a/clang/lib/AST/Mangle.cpp
+++ b/clang/lib/AST/Mangle.cpp
@@ -225,7 +225,7 @@ void MangleContext::mangleName(GlobalDecl GD, raw_ostream &Out) {
   assert(!Proto->isVariadic());
   unsigned ArgWords = 0;
   if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD))
-    if (!MD->isStatic())
+    if (MD->isImplicitObjectMemberFunction())
       ++ArgWords;
   uint64_t DefaultPtrWidth = TI.getPointerWidth(LangAS::Default);
   for (const auto &AT : Proto->param_types()) {
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 79175c79de96bf..f6d8cdee8443d5 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -196,8 +196,8 @@ class MicrosoftMangleContextImpl : public MicrosoftMangleContext {
   mangleCXXRTTICompleteObjectLocator(const CXXRecordDecl *Derived,
                                      ArrayRef<const CXXRecordDecl *> BasePath,
                                      raw_ostream &Out) override;
-  void mangleTypeName(QualType T, raw_ostream &,
-                      bool NormalizeIntegers) override;
+  void mangleCanonicalTypeName(QualType T, raw_ostream &,
+                               bool NormalizeIntegers) override;
   void mangleReferenceTemporary(const VarDecl *, unsigned ManglingNumber,
                                 raw_ostream &) override;
   void mangleStaticGuardVariable(const VarDecl *D, raw_ostream &Out) override;
@@ -2636,7 +2636,7 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T,
   if (const CXXMethodDecl *MD = dyn_cast_or_null<CXXMethodDecl>(D)) {
     if (MD->getParent()->isLambda())
       IsInLambda = true;
-    if (MD->isInstance())
+    if (MD->isImplicitObjectMemberFunction())
       HasThisQuals = true;
     if (isa<CXXDestructorDecl>(MD)) {
       IsStructor = true;
@@ -2745,6 +2745,10 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T,
   } else {
     // Happens for function pointer type arguments for example.
     for (unsigned I = 0, E = Proto->getNumParams(); I != E; ++I) {
+      // Explicit object parameters are prefixed by "_V".
+      if (I == 0 && D && D->getParamDecl(I)->isExplicitObjectParameter())
+        Out << "_V";
+
       mangleFunctionArgumentType(Proto->getParamType(I), Range);
       // Mangle each pass_object_size parameter as if it's a parameter of enum
       // type passed directly after the parameter with the pass_object_size
@@ -2810,7 +2814,7 @@ void MicrosoftCXXNameMangler::mangleFunctionClass(const FunctionDecl *FD) {
       case AS_none:
         llvm_unreachable("Unsupported access specifier");
       case AS_private:
-        if (MD->isStatic())
+        if (!MD->isImplicitObjectMemberFunction())
           Out << 'C';
         else if (IsVirtual)
           Out << 'E';
@@ -2818,7 +2822,7 @@ void MicrosoftCXXNameMangler::mangleFunctionClass(const FunctionDecl *FD) {
           Out << 'A';
         break;
       case AS_protected:
-        if (MD->isStatic())
+        if (!MD->isImplicitObjectMemberFunction())
           Out << 'K';
         else if (IsVirtual)
           Out << 'M';
@@ -2826,7 +2830,7 @@ void MicrosoftCXXNameMangler::mangleFunctionClass(const FunctionDecl *FD) {
           Out << 'I';
         break;
       case AS_public:
-        if (MD->isStatic())
+        if (!MD->isImplicitObjectMemberFunction())
           Out << 'S';
         else if (IsVirtual)
           Out << 'U';
@@ -3838,13 +3842,13 @@ void MicrosoftMangleContextImpl::mangleSEHFinallyBlock(
   Mangler.mangleName(EnclosingDecl);
 }
 
-void MicrosoftMangleContextImpl::mangleTypeName(
+void MicrosoftMangleContextImpl::mangleCanonicalTypeName(
     QualType T, raw_ostream &Out, bool NormalizeIntegers = false) {
   // This is just a made up unique string for the purposes of tbaa.  undname
   // does *not* know how to demangle it.
   MicrosoftCXXNameMangler Mangler(*this, Out);
   Mangler.getStream() << '?';
-  Mangler.mangleType(T, SourceRange());
+  Mangler.mangleType(T.getCanonicalType(), SourceRange());
 }
 
 void MicrosoftMangleContextImpl::mangleReferenceTemporary(
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index f5ad75028a641e..b95b4fce180e73 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -170,6 +170,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
   case OMPC_affinity:
   case OMPC_when:
   case OMPC_bind:
+  case OMPC_ompx_bare:
     break;
   default:
     break;
@@ -2546,6 +2547,10 @@ void OMPClausePrinter::VisitOMPXAttributeClause(OMPXAttributeClause *Node) {
   OS << ")";
 }
 
+void OMPClausePrinter::VisitOMPXBareClause(OMPXBareClause *Node) {
+  OS << "ompx_bare";
+}
+
 void OMPTraitInfo::getAsVariantMatchInfo(ASTContext &ASTCtx,
                                          VariantMatchInfo &VMI) const {
   for (const OMPTraitSet &Set : Sets) {
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 2e4f15f83ac26e..22b6855b0fff23 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -930,6 +930,7 @@ void OMPClauseProfiler::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
 }
 void OMPClauseProfiler::VisitOMPXAttributeClause(const OMPXAttributeClause *C) {
 }
+void OMPClauseProfiler::VisitOMPXBareClause(const OMPXBareClause *C) {}
 } // namespace
 
 void
@@ -1335,6 +1336,8 @@ void StmtProfiler::VisitIntegerLiteral(const IntegerLiteral *S) {
   S->getValue().Profile(ID);
 
   QualType T = S->getType();
+  if (Canonical)
+    T = T.getCanonicalType();
   ID.AddInteger(T->getTypeClass());
   if (auto BitIntT = T->getAs<BitIntType>())
     BitIntT->Profile(ID);
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 233dc6c6359515..15eb6c6191edf2 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -1949,6 +1949,10 @@ void TextNodeDumper::VisitFieldDecl(const FieldDecl *D) {
 void TextNodeDumper::VisitVarDecl(const VarDecl *D) {
   dumpNestedNameSpecifier(D->getQualifier());
   dumpName(D);
+  if (const auto *P = dyn_cast<ParmVarDecl>(D);
+      P && P->isExplicitObjectParameter())
+    OS << " this";
+
   dumpType(D->getType());
   dumpTemplateSpecializationKind(D->getTemplateSpecializationKind());
   StorageClass SC = D->getStorageClass();
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index c08ebfb7f142b3..4c433f7fe9daca 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -225,13 +225,12 @@ void ConstantArrayType::Profile(llvm::FoldingSetNodeID &ID,
     SizeExpr->Profile(ID, Context, true);
 }
 
-DependentSizedArrayType::DependentSizedArrayType(const ASTContext &Context,
-                                                 QualType et, QualType can,
+DependentSizedArrayType::DependentSizedArrayType(QualType et, QualType can,
                                                  Expr *e, ArraySizeModifier sm,
                                                  unsigned tq,
                                                  SourceRange brackets)
-    : ArrayType(DependentSizedArray, et, can, sm, tq, e),
-      Context(Context), SizeExpr((Stmt*) e), Brackets(brackets) {}
+    : ArrayType(DependentSizedArray, et, can, sm, tq, e), SizeExpr((Stmt *)e),
+      Brackets(brackets) {}
 
 void DependentSizedArrayType::Profile(llvm::FoldingSetNodeID &ID,
                                       const ASTContext &Context,
@@ -245,8 +244,7 @@ void DependentSizedArrayType::Profile(llvm::FoldingSetNodeID &ID,
   E->Profile(ID, Context, true);
 }
 
-DependentVectorType::DependentVectorType(const ASTContext &Context,
-                                         QualType ElementType,
+DependentVectorType::DependentVectorType(QualType ElementType,
                                          QualType CanonType, Expr *SizeExpr,
                                          SourceLocation Loc,
                                          VectorType::VectorKind VecKind)
@@ -255,7 +253,7 @@ DependentVectorType::DependentVectorType(const ASTContext &Context,
                ElementType->getDependence() |
                (SizeExpr ? toTypeDependence(SizeExpr->getDependence())
                          : TypeDependence::None)),
-      Context(Context), ElementType(ElementType), SizeExpr(SizeExpr), Loc(Loc) {
+      ElementType(ElementType), SizeExpr(SizeExpr), Loc(Loc) {
   VectorTypeBits.VecKind = VecKind;
 }
 
@@ -268,16 +266,16 @@ void DependentVectorType::Profile(llvm::FoldingSetNodeID &ID,
   SizeExpr->Profile(ID, Context, true);
 }
 
-DependentSizedExtVectorType::DependentSizedExtVectorType(
-    const ASTContext &Context, QualType ElementType, QualType can,
-    Expr *SizeExpr, SourceLocation loc)
+DependentSizedExtVectorType::DependentSizedExtVectorType(QualType ElementType,
+                                                         QualType can,
+                                                         Expr *SizeExpr,
+                                                         SourceLocation loc)
     : Type(DependentSizedExtVector, can,
            TypeDependence::DependentInstantiation |
                ElementType->getDependence() |
                (SizeExpr ? toTypeDependence(SizeExpr->getDependence())
                          : TypeDependence::None)),
-      Context(Context), SizeExpr(SizeExpr), ElementType(ElementType), loc(loc) {
-}
+      SizeExpr(SizeExpr), ElementType(ElementType), loc(loc) {}
 
 void
 DependentSizedExtVectorType::Profile(llvm::FoldingSetNodeID &ID,
@@ -287,8 +285,7 @@ DependentSizedExtVectorType::Profile(llvm::FoldingSetNodeID &ID,
   SizeExpr->Profile(ID, Context, true);
 }
 
-DependentAddressSpaceType::DependentAddressSpaceType(const ASTContext &Context,
-                                                     QualType PointeeType,
+DependentAddressSpaceType::DependentAddressSpaceType(QualType PointeeType,
                                                      QualType can,
                                                      Expr *AddrSpaceExpr,
                                                      SourceLocation loc)
@@ -297,8 +294,7 @@ DependentAddressSpaceType::DependentAddressSpaceType(const ASTContext &Context,
                PointeeType->getDependence() |
                (AddrSpaceExpr ? toTypeDependence(AddrSpaceExpr->getDependence())
                               : TypeDependence::None)),
-      Context(Context), AddrSpaceExpr(AddrSpaceExpr), PointeeType(PointeeType),
-      loc(loc) {}
+      AddrSpaceExpr(AddrSpaceExpr), PointeeType(PointeeType), loc(loc) {}
 
 void DependentAddressSpaceType::Profile(llvm::FoldingSetNodeID &ID,
                                         const ASTContext &Context,
@@ -337,12 +333,14 @@ ConstantMatrixType::ConstantMatrixType(TypeClass tc, QualType matrixType,
     : MatrixType(tc, matrixType, canonType), NumRows(nRows),
       NumColumns(nColumns) {}
 
-DependentSizedMatrixType::DependentSizedMatrixType(
-    const ASTContext &CTX, QualType ElementType, QualType CanonicalType,
-    Expr *RowExpr, Expr *ColumnExpr, SourceLocation loc)
+DependentSizedMatrixType::DependentSizedMatrixType(QualType ElementType,
+                                                   QualType CanonicalType,
+                                                   Expr *RowExpr,
+                                                   Expr *ColumnExpr,
+                                                   SourceLocation loc)
     : MatrixType(DependentSizedMatrix, ElementType, CanonicalType, RowExpr,
                  ColumnExpr),
-      Context(CTX), RowExpr(RowExpr), ColumnExpr(ColumnExpr), loc(loc) {}
+      RowExpr(RowExpr), ColumnExpr(ColumnExpr), loc(loc) {}
 
 void DependentSizedMatrixType::Profile(llvm::FoldingSetNodeID &ID,
                                        const ASTContext &CTX,
@@ -368,11 +366,10 @@ BitIntType::BitIntType(bool IsUnsigned, unsigned NumBits)
     : Type(BitInt, QualType{}, TypeDependence::None), IsUnsigned(IsUnsigned),
       NumBits(NumBits) {}
 
-DependentBitIntType::DependentBitIntType(const ASTContext &Context,
-                                         bool IsUnsigned, Expr *NumBitsExpr)
+DependentBitIntType::DependentBitIntType(bool IsUnsigned, Expr *NumBitsExpr)
     : Type(DependentBitInt, QualType{},
            toTypeDependence(NumBitsExpr->getDependence())),
-      Context(Context), ExprAndUnsigned(NumBitsExpr, IsUnsigned) {}
+      ExprAndUnsigned(NumBitsExpr, IsUnsigned) {}
 
 bool DependentBitIntType::isUnsigned() const {
   return ExprAndUnsigned.getInt();
@@ -3726,8 +3723,8 @@ QualType DecltypeType::desugar() const {
   return QualType(this, 0);
 }
 
-DependentDecltypeType::DependentDecltypeType(const ASTContext &Context, Expr *E)
-    : DecltypeType(E, Context.DependentTy), Context(Context) {}
+DependentDecltypeType::DependentDecltypeType(Expr *E, QualType UnderlyingType)
+    : DecltypeType(E, UnderlyingType) {}
 
 void DependentDecltypeType::Profile(llvm::FoldingSetNodeID &ID,
                                     const ASTContext &Context, Expr *E) {
diff --git a/clang/lib/Analysis/Consumed.cpp b/clang/lib/Analysis/Consumed.cpp
index d90a8a966b210b..d01c7f688e8b5d 100644
--- a/clang/lib/Analysis/Consumed.cpp
+++ b/clang/lib/Analysis/Consumed.cpp
@@ -771,7 +771,7 @@ void ConsumedStmtVisitor::VisitCXXBindTemporaryExpr(
 void ConsumedStmtVisitor::VisitCXXConstructExpr(const CXXConstructExpr *Call) {
   CXXConstructorDecl *Constructor = Call->getConstructor();
 
-  QualType ThisType = Constructor->getThisObjectType();
+  QualType ThisType = Constructor->getFunctionObjectParameterType();
 
   if (!isConsumableType(ThisType))
     return;
@@ -1199,7 +1199,7 @@ void ConsumedAnalyzer::determineExpectedReturnState(AnalysisDeclContext &AC,
                                                     const FunctionDecl *D) {
   QualType ReturnType;
   if (const auto *Constructor = dyn_cast<CXXConstructorDecl>(D)) {
-    ReturnType = Constructor->getThisObjectType();
+    ReturnType = Constructor->getFunctionObjectParameterType();
   } else
     ReturnType = D->getCallResultType();
 
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 1ea40927ac58ad..66d98c99546859 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -452,8 +452,8 @@ Environment::Environment(DataflowAnalysisContext &DACtx,
       MethodDecl = dyn_cast<CXXMethodDecl>(Parent->getDeclContext());
 
     // FIXME: Initialize the ThisPointeeLoc of lambdas too.
-    if (MethodDecl && !MethodDecl->isStatic()) {
-      QualType ThisPointeeType = MethodDecl->getThisObjectType();
+    if (MethodDecl && MethodDecl->isImplicitObjectMemberFunction()) {
+      QualType ThisPointeeType = MethodDecl->getFunctionObjectParameterType();
       ThisPointeeLoc =
           &cast<RecordValue>(createValue(ThisPointeeType))->getLoc();
     }
diff --git a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
index 47915958750d1f..8aef1d6f46089d 100644
--- a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
+++ b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
@@ -150,6 +150,7 @@ class HTMLLogger : public Logger {
     const CFGBlock *Block;
     unsigned Iter;
     bool PostVisit;
+    bool Converged;
   };
 
   StreamFactory Streams;
@@ -159,8 +160,8 @@ class HTMLLogger : public Logger {
   const ControlFlowContext *CFG;
   // Timeline of iterations of CFG block visitation.
   std::vector<Iteration> Iters;
-  // Number of times each CFG block has been seen.
-  llvm::DenseMap<const CFGBlock *, llvm::SmallVector<Iteration>> BlockIters;
+  // Indexes  in `Iters` of the iterations for each block.
+  llvm::DenseMap<const CFGBlock *, llvm::SmallVector<size_t>> BlockIters;
   // The messages logged in the current context but not yet written.
   std::string ContextLogs;
   // The number of elements we have visited within the current CFG block.
@@ -207,6 +208,7 @@ class HTMLLogger : public Logger {
           JOS->attribute("block", blockID(E.Block->getBlockID()));
           JOS->attribute("iter", E.Iter);
           JOS->attribute("post_visit", E.PostVisit);
+          JOS->attribute("converged", E.Converged);
         });
       }
     });
@@ -222,10 +224,10 @@ class HTMLLogger : public Logger {
   }
 
   void enterBlock(const CFGBlock &B, bool PostVisit) override {
-    llvm::SmallVector<Iteration> &BIter = BlockIters[&B];
+    llvm::SmallVector<size_t> &BIter = BlockIters[&B];
     unsigned IterNum = BIter.size() + 1;
-    BIter.push_back({&B, IterNum, PostVisit});
-    Iters.push_back({&B, IterNum, PostVisit});
+    BIter.push_back(Iters.size());
+    Iters.push_back({&B, IterNum, PostVisit, /*Converged=*/false});
     ElementIndex = 0;
   }
   void enterElement(const CFGElement &E) override {
@@ -290,7 +292,7 @@ class HTMLLogger : public Logger {
       }
     });
   }
-  void blockConverged() override { logText("Block converged"); }
+  void blockConverged() override { Iters.back().Converged = true; }
 
   void logText(llvm::StringRef S) override {
     ContextLogs.append(S.begin(), S.end());
@@ -301,13 +303,15 @@ class HTMLLogger : public Logger {
   // Write the CFG block details.
   // Currently this is just the list of elements in execution order.
   // FIXME: an AST dump would be a useful view, too.
-  void writeBlock(const CFGBlock &B, llvm::ArrayRef<Iteration> ItersForB) {
+  void writeBlock(const CFGBlock &B, llvm::ArrayRef<size_t> ItersForB) {
     JOS->attributeObject(blockID(B.getBlockID()), [&] {
       JOS->attributeArray("iters", [&] {
-        for (const auto &Iter : ItersForB) {
+        for (size_t IterIdx : ItersForB) {
+          const Iteration &Iter = Iters[IterIdx];
           JOS->object([&] {
             JOS->attribute("iter", Iter.Iter);
             JOS->attribute("post_visit", Iter.PostVisit);
+            JOS->attribute("converged", Iter.Converged);
           });
         }
       });
diff --git a/clang/lib/Analysis/FlowSensitive/HTMLLogger.html b/clang/lib/Analysis/FlowSensitive/HTMLLogger.html
index 6d866d57e14486..ec9d74c71d35d9 100644
--- a/clang/lib/Analysis/FlowSensitive/HTMLLogger.html
+++ b/clang/lib/Analysis/FlowSensitive/HTMLLogger.html
@@ -45,6 +45,7 @@
     {{entry.block}}
     <template data-if="entry.post_visit">(post-visit)</template>
     <template data-if="!entry.post_visit">({{entry.iter}})</template>
+    <template data-if="entry.converged"> &#x2192;&#x7c;<!--Rightwards arrow, vertical line--></template>
   </div>
 </template>
 </section>
@@ -62,6 +63,7 @@
     <a class="chooser {{selection.bb}}:{{iter.iter}}" data-iter="{{selection.bb}}:{{iter.iter}}">
       <template data-if="iter.post_visit">Post-visit</template>
       <template data-if="!iter.post_visit">Iteration {{iter.iter}}</template>
+      <template data-if="iter.converged"> &#x2192;&#x7c;<!--Rightwards arrow, vertical line--></template>
     </a>
   </template>
 </div>
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 6c92b847cd1d28..bb26e9f8dc8db8 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -525,7 +525,9 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
       // The assignment operators are different from the type of the destination
       // in this model (i.e. in one of their base classes). This must be very
       // rare and we just bail.
-      if (Method->getThisObjectType().getCanonicalType().getUnqualifiedType() !=
+      if (Method->getFunctionObjectParameterType()
+              .getCanonicalType()
+              .getUnqualifiedType() !=
           LocDst->getType().getCanonicalType().getUnqualifiedType())
         return;
 
diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp
index 58dd7113665b13..54d0e95c6bd79a 100644
--- a/clang/lib/Analysis/ThreadSafety.cpp
+++ b/clang/lib/Analysis/ThreadSafety.cpp
@@ -1008,7 +1008,7 @@ class ThreadSafetyAnalyzer {
   threadSafety::SExprBuilder SxBuilder;
 
   ThreadSafetyHandler &Handler;
-  const CXXMethodDecl *CurrentMethod = nullptr;
+  const FunctionDecl *CurrentFunction;
   LocalVariableMap LocalVarMap;
   FactManager FactMan;
   std::vector<CFGBlockInfo> BlockInfo;
@@ -1243,10 +1243,10 @@ bool ThreadSafetyAnalyzer::inCurrentScope(const CapabilityExpr &CapE) {
 
   // Members are in scope from methods of the same class.
   if (const auto *P = dyn_cast<til::Project>(SExp)) {
-    if (!CurrentMethod)
+    if (!isa_and_nonnull<CXXMethodDecl>(CurrentFunction))
       return false;
     const ValueDecl *VD = P->clangDecl();
-    return VD->getDeclContext() == CurrentMethod->getDeclContext();
+    return VD->getDeclContext() == CurrentFunction->getDeclContext();
   }
 
   return false;
@@ -1541,6 +1541,8 @@ class BuildLockset : public ConstStmtVisitor<BuildLockset> {
 
   ThreadSafetyAnalyzer *Analyzer;
   FactSet FSet;
+  // The fact set for the function on exit.
+  const FactSet &FunctionExitFSet;
   /// Maps constructed objects to `this` placeholder prior to initialization.
   llvm::SmallDenseMap<const Expr *, til::LiteralPtr *> ConstructedObjects;
   LocalVariableMap::Context LVarCtx;
@@ -1566,9 +1568,11 @@ class BuildLockset : public ConstStmtVisitor<BuildLockset> {
                         bool SkipFirstParam = false);
 
 public:
-  BuildLockset(ThreadSafetyAnalyzer *Anlzr, CFGBlockInfo &Info)
+  BuildLockset(ThreadSafetyAnalyzer *Anlzr, CFGBlockInfo &Info,
+               const FactSet &FunctionExitFSet)
       : ConstStmtVisitor<BuildLockset>(), Analyzer(Anlzr), FSet(Info.EntrySet),
-        LVarCtx(Info.EntryContext), CtxIndex(Info.EntryIndex) {}
+        FunctionExitFSet(FunctionExitFSet), LVarCtx(Info.EntryContext),
+        CtxIndex(Info.EntryIndex) {}
 
   void VisitUnaryOperator(const UnaryOperator *UO);
   void VisitBinaryOperator(const BinaryOperator *BO);
@@ -1577,6 +1581,7 @@ class BuildLockset : public ConstStmtVisitor<BuildLockset> {
   void VisitCXXConstructExpr(const CXXConstructExpr *Exp);
   void VisitDeclStmt(const DeclStmt *S);
   void VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *Exp);
+  void VisitReturnStmt(const ReturnStmt *S);
 };
 
 } // namespace
@@ -1758,6 +1763,8 @@ void ThreadSafetyAnalyzer::checkPtAccess(const FactSet &FSet, const Expr *Exp,
   // Pass by reference warnings are under a different flag.
   ProtectedOperationKind PtPOK = POK_VarDereference;
   if (POK == POK_PassByRef) PtPOK = POK_PtPassByRef;
+  if (POK == POK_ReturnByRef)
+    PtPOK = POK_PtReturnByRef;
 
   const ValueDecl *D = getValueDecl(Exp);
   if (!D || !D->hasAttrs())
@@ -2142,6 +2149,25 @@ void BuildLockset::VisitMaterializeTemporaryExpr(
   }
 }
 
+void BuildLockset::VisitReturnStmt(const ReturnStmt *S) {
+  if (Analyzer->CurrentFunction == nullptr)
+    return;
+  const Expr *RetVal = S->getRetValue();
+  if (!RetVal)
+    return;
+
+  // If returning by reference, check that the function requires the appropriate
+  // capabilities.
+  const QualType ReturnType =
+      Analyzer->CurrentFunction->getReturnType().getCanonicalType();
+  if (ReturnType->isLValueReferenceType()) {
+    Analyzer->checkAccess(
+        FunctionExitFSet, RetVal,
+        ReturnType->getPointeeType().isConstQualified() ? AK_Read : AK_Written,
+        POK_ReturnByRef);
+  }
+}
+
 /// Given two facts merging on a join point, possibly warn and decide whether to
 /// keep or replace.
 ///
@@ -2251,8 +2277,7 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
 
   CFG *CFGraph = walker.getGraph();
   const NamedDecl *D = walker.getDecl();
-  const auto *CurrentFunction = dyn_cast<FunctionDecl>(D);
-  CurrentMethod = dyn_cast<CXXMethodDecl>(D);
+  CurrentFunction = dyn_cast<FunctionDecl>(D);
 
   if (D->hasAttr<NoThreadSafetyAnalysisAttr>())
     return;
@@ -2278,7 +2303,7 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
   PostOrderCFGView::CFGBlockSet VisitedBlocks(CFGraph);
 
   CFGBlockInfo &Initial = BlockInfo[CFGraph->getEntry().getBlockID()];
-  CFGBlockInfo &Final   = BlockInfo[CFGraph->getExit().getBlockID()];
+  CFGBlockInfo &Final = BlockInfo[CFGraph->getExit().getBlockID()];
 
   // Mark entry block as reachable
   Initial.Reachable = true;
@@ -2348,6 +2373,25 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
     }
   }
 
+  // Compute the expected exit set.
+  // By default, we expect all locks held on entry to be held on exit.
+  FactSet ExpectedFunctionExitSet = Initial.EntrySet;
+
+  // Adjust the expected exit set by adding or removing locks, as declared
+  // by *-LOCK_FUNCTION and UNLOCK_FUNCTION.  The intersect below will then
+  // issue the appropriate warning.
+  // FIXME: the location here is not quite right.
+  for (const auto &Lock : ExclusiveLocksAcquired)
+    ExpectedFunctionExitSet.addLock(
+        FactMan, std::make_unique<LockableFactEntry>(Lock, LK_Exclusive,
+                                                     D->getLocation()));
+  for (const auto &Lock : SharedLocksAcquired)
+    ExpectedFunctionExitSet.addLock(
+        FactMan,
+        std::make_unique<LockableFactEntry>(Lock, LK_Shared, D->getLocation()));
+  for (const auto &Lock : LocksReleased)
+    ExpectedFunctionExitSet.removeLock(FactMan, Lock);
+
   for (const auto *CurrBlock : *SortedGraph) {
     unsigned CurrBlockID = CurrBlock->getBlockID();
     CFGBlockInfo *CurrBlockInfo = &BlockInfo[CurrBlockID];
@@ -2407,7 +2451,7 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
     if (!CurrBlockInfo->Reachable)
       continue;
 
-    BuildLockset LocksetBuilder(this, *CurrBlockInfo);
+    BuildLockset LocksetBuilder(this, *CurrBlockInfo, ExpectedFunctionExitSet);
 
     // Visit all the statements in the basic block.
     for (const auto &BI : *CurrBlock) {
@@ -2483,24 +2527,8 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
   if (!Final.Reachable)
     return;
 
-  // By default, we expect all locks held on entry to be held on exit.
-  FactSet ExpectedExitSet = Initial.EntrySet;
-
-  // Adjust the expected exit set by adding or removing locks, as declared
-  // by *-LOCK_FUNCTION and UNLOCK_FUNCTION.  The intersect below will then
-  // issue the appropriate warning.
-  // FIXME: the location here is not quite right.
-  for (const auto &Lock : ExclusiveLocksAcquired)
-    ExpectedExitSet.addLock(FactMan, std::make_unique<LockableFactEntry>(
-                                         Lock, LK_Exclusive, D->getLocation()));
-  for (const auto &Lock : SharedLocksAcquired)
-    ExpectedExitSet.addLock(FactMan, std::make_unique<LockableFactEntry>(
-                                         Lock, LK_Shared, D->getLocation()));
-  for (const auto &Lock : LocksReleased)
-    ExpectedExitSet.removeLock(FactMan, Lock);
-
   // FIXME: Should we call this function for all blocks which exit the function?
-  intersectAndWarn(ExpectedExitSet, Final.ExitSet, Final.ExitLoc,
+  intersectAndWarn(ExpectedFunctionExitSet, Final.ExitSet, Final.ExitLoc,
                    LEK_LockedAtEndOfFunction, LEK_NotLockedAtEndOfFunction);
 
   Handler.leaveFunction(CurrentFunction);
diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp
index 63cc66852a9eb7..2fe0f85897c3bc 100644
--- a/clang/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp
@@ -154,7 +154,9 @@ CapabilityExpr SExprBuilder::translateAttrExpr(const Expr *AttrExp,
     // If the attribute has no arguments, then assume the argument is "this".
     if (!AttrExp)
       return CapabilityExpr(
-          Self, ClassifyDiagnostic(cast<CXXMethodDecl>(D)->getThisObjectType()),
+          Self,
+          ClassifyDiagnostic(
+              cast<CXXMethodDecl>(D)->getFunctionObjectParameterType()),
           false);
     else  // For most attributes.
       return translateAttrExpr(AttrExp, &Ctx);
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 7a54d27ef9d8ee..0208ccc31bd7fc 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -800,9 +800,10 @@ FormatDiagnostic(SmallVectorImpl<char> &OutStr) const {
   FormatDiagnostic(Diag.begin(), Diag.end(), OutStr);
 }
 
-/// pushEscapedString - Append Str to the diagnostic buffer,
+/// EscapeStringForDiagnostic - Append Str to the diagnostic buffer,
 /// escaping non-printable characters and ill-formed code unit sequences.
-static void pushEscapedString(StringRef Str, SmallVectorImpl<char> &OutStr) {
+void clang::EscapeStringForDiagnostic(StringRef Str,
+                                      SmallVectorImpl<char> &OutStr) {
   OutStr.reserve(OutStr.size() + Str.size());
   auto *Begin = reinterpret_cast<const unsigned char *>(Str.data());
   llvm::raw_svector_ostream OutStream(OutStr);
@@ -854,7 +855,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
       StringRef(DiagStr, DiagEnd - DiagStr).equals("%0") &&
       getArgKind(0) == DiagnosticsEngine::ak_std_string) {
     const std::string &S = getArgStdStr(0);
-    pushEscapedString(S, OutStr);
+    EscapeStringForDiagnostic(S, OutStr);
     return;
   }
 
@@ -961,7 +962,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
     case DiagnosticsEngine::ak_std_string: {
       const std::string &S = getArgStdStr(ArgNo);
       assert(ModifierLen == 0 && "No modifiers for strings yet");
-      pushEscapedString(S, OutStr);
+      EscapeStringForDiagnostic(S, OutStr);
       break;
     }
     case DiagnosticsEngine::ak_c_string: {
@@ -971,7 +972,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
       // Don't crash if get passed a null pointer by accident.
       if (!S)
         S = "(null)";
-      pushEscapedString(S, OutStr);
+      EscapeStringForDiagnostic(S, OutStr);
       break;
     }
     // ---- INTEGERS ----
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index 0455304ef7f2b1..7879a11179b6d4 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -299,8 +299,10 @@ bool Module::directlyUses(const Module *Requested) {
     if (Requested->isSubModuleOf(Use))
       return true;
 
-  // Anyone is allowed to use our builtin stddef.h and its accompanying module.
-  if (!Requested->Parent && Requested->Name == "_Builtin_stddef_max_align_t")
+  // Anyone is allowed to use our builtin stdarg.h and stddef.h and their
+  // accompanying modules.
+  if (Requested->getTopLevelModuleName() == "_Builtin_stdarg" ||
+      Requested->getTopLevelModuleName() == "_Builtin_stddef")
     return true;
 
   if (NoUndeclaredIncludes)
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 3066cc53dbfd87..7312f0514e72fd 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -527,17 +527,6 @@ FileID SourceManager::getNextFileID(FileID FID) const {
 
 /// Create a new FileID that represents the specified file
 /// being \#included from the specified IncludePosition.
-///
-/// This translates NULL into standard input.
-FileID SourceManager::createFileID(const FileEntry *SourceFile,
-                                   SourceLocation IncludePos,
-                                   SrcMgr::CharacteristicKind FileCharacter,
-                                   int LoadedID,
-                                   SourceLocation::UIntTy LoadedOffset) {
-  return createFileID(SourceFile->getLastRef(), IncludePos, FileCharacter,
-                      LoadedID, LoadedOffset);
-}
-
 FileID SourceManager::createFileID(FileEntryRef SourceFile,
                                    SourceLocation IncludePos,
                                    SrcMgr::CharacteristicKind FileCharacter,
@@ -585,7 +574,7 @@ FileID SourceManager::createFileID(const llvm::MemoryBufferRef &Buffer,
 /// Get the FileID for \p SourceFile if it exists. Otherwise, create a
 /// new FileID for the \p SourceFile.
 FileID
-SourceManager::getOrCreateFileID(const FileEntry *SourceFile,
+SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
                                  SrcMgr::CharacteristicKind FileCharacter) {
   FileID ID = translateFile(SourceFile);
   return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(),
@@ -2375,8 +2364,9 @@ SourceManagerForFile::SourceManagerForFile(StringRef FileName,
       IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
       new DiagnosticOptions);
   SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr);
-  FileID ID = SourceMgr->createFileID(*FileMgr->getFile(FileName),
-                                      SourceLocation(), clang::SrcMgr::C_User);
+  FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName));
+  FileID ID =
+      SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User);
   assert(ID.isValid());
   SourceMgr->setMainFileID(ID);
 }
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index b0fe8e03aa0f5f..d066819871dfde 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -923,6 +923,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
   ModulePassManager MPM;
+  // Add a verifier pass, before any other passes, to catch CodeGen issues.
+  if (CodeGenOpts.VerifyModule)
+    MPM.addPass(VerifierPass());
 
   if (!CodeGenOpts.DisableLLVMPasses) {
     // Map our optimization levels into one of the distinct levels used to
@@ -1020,21 +1023,23 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (CodeGenOpts.FatLTO) {
-      MPM = PB.buildFatLTODefaultPipeline(Level, PrepareForThinLTO,
-                                          PrepareForThinLTO ||
-                                              shouldEmitRegularLTOSummary());
+      MPM.addPass(PB.buildFatLTODefaultPipeline(
+          Level, PrepareForThinLTO,
+          PrepareForThinLTO || shouldEmitRegularLTOSummary()));
     } else if (PrepareForThinLTO) {
-      MPM = PB.buildThinLTOPreLinkDefaultPipeline(Level);
+      MPM.addPass(PB.buildThinLTOPreLinkDefaultPipeline(Level));
     } else if (PrepareForLTO) {
-      MPM = PB.buildLTOPreLinkDefaultPipeline(Level);
+      MPM.addPass(PB.buildLTOPreLinkDefaultPipeline(Level));
     } else {
-      MPM = PB.buildPerModuleDefaultPipeline(Level);
+      MPM.addPass(PB.buildPerModuleDefaultPipeline(Level));
     }
   }
 
   // Add a verifier pass if requested. We don't have to do this if the action
   // requires code generation because there will already be a verifier pass in
   // the code-generation pipeline.
+  // Since we already added a verifier pass above, this
+  // might even not run the analysis, if previous passes caused no changes.
   if (!actionRequiresCodeGen(Action) && CodeGenOpts.VerifyModule)
     MPM.addPass(VerifierPass());
 
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index aa3e730d28efae..27f525ee35edbf 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1710,7 +1710,7 @@ static std::string getBlockCaptureStr(const CGBlockInfo::Capture &Cap,
     Str += "c";
     SmallString<256> TyStr;
     llvm::raw_svector_ostream Out(TyStr);
-    CGM.getCXXABI().getMangleContext().mangleTypeName(CaptureTy, Out);
+    CGM.getCXXABI().getMangleContext().mangleCanonicalTypeName(CaptureTy, Out);
     Str += llvm::to_string(TyStr.size()) + TyStr.c_str();
     break;
   }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3f68aa2c953c74..bf984861bccb5c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -853,6 +853,57 @@ CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
     }
   }
 
+  if (IsDynamic) {
+    LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
+        getLangOpts().getStrictFlexArraysLevel();
+    const Expr *Base = E->IgnoreParenImpCasts();
+
+    if (FieldDecl *FD = FindCountedByField(Base, StrictFlexArraysLevel)) {
+      const auto *ME = dyn_cast<MemberExpr>(Base);
+      llvm::Value *ObjectSize = nullptr;
+
+      if (!ME) {
+        const auto *DRE = dyn_cast<DeclRefExpr>(Base);
+        ValueDecl *VD = nullptr;
+
+        ObjectSize = ConstantInt::get(
+            ResType,
+            getContext().getTypeSize(DRE->getType()->getPointeeType()) / 8,
+            true);
+
+        if (auto *RD = DRE->getType()->getPointeeType()->getAsRecordDecl())
+          VD = RD->getLastField();
+
+        Expr *ICE = ImplicitCastExpr::Create(
+            getContext(), DRE->getType(), CK_LValueToRValue,
+            const_cast<Expr *>(cast<Expr>(DRE)), nullptr, VK_PRValue,
+            FPOptionsOverride());
+        ME = MemberExpr::CreateImplicit(getContext(), ICE, true, VD,
+                                        VD->getType(), VK_LValue, OK_Ordinary);
+      }
+
+      // At this point, we know that \p ME is a flexible array member.
+      const auto *ArrayTy = getContext().getAsArrayType(ME->getType());
+      unsigned Size = getContext().getTypeSize(ArrayTy->getElementType());
+
+      llvm::Value *CountField =
+          EmitAnyExprToTemp(MemberExpr::CreateImplicit(
+                                getContext(), const_cast<Expr *>(ME->getBase()),
+                                ME->isArrow(), FD, FD->getType(), VK_LValue,
+                                OK_Ordinary))
+              .getScalarVal();
+
+      llvm::Value *Mul = Builder.CreateMul(
+          CountField, llvm::ConstantInt::get(CountField->getType(), Size / 8));
+      Mul = Builder.CreateZExtOrTrunc(Mul, ResType);
+
+      if (ObjectSize)
+        return Builder.CreateAdd(ObjectSize, Mul);
+
+      return Mul;
+    }
+  }
+
   // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
   // evaluate E for side-effects. In either case, we shouldn't lower to
   // @llvm.objectsize.
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 3fe8d0a7cf49e1..93e16575042c4d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -298,7 +298,7 @@ CodeGenTypes::arrangeCXXMethodDeclaration(const CXXMethodDecl *MD) {
   setCUDAKernelCallingConvention(FT, CGM, MD);
   auto prototype = FT.getAs<FunctionProtoType>();
 
-  if (MD->isInstance()) {
+  if (MD->isImplicitObjectMemberFunction()) {
     // The abstract case is perfectly fine.
     const CXXRecordDecl *ThisType = TheCXXABI.getThisArgumentTypeForMethod(MD);
     return arrangeCXXMethodType(ThisType, prototype.getTypePtr(), MD);
@@ -448,7 +448,7 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args,
 const CGFunctionInfo &
 CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) {
   if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD))
-    if (MD->isInstance())
+    if (MD->isImplicitObjectMemberFunction())
       return arrangeCXXMethodDeclaration(MD);
 
   CanQualType FTy = FD->getType()->getCanonicalTypeUnqualified();
diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp
index e7ed3ec431f75b..d18f186ce5b415 100644
--- a/clang/lib/CodeGen/CGClass.cpp
+++ b/clang/lib/CodeGen/CGClass.cpp
@@ -28,6 +28,7 @@
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Transforms/Utils/SanitizerStats.h"
 #include <optional>
 
@@ -138,7 +139,7 @@ Address CodeGenFunction::LoadCXXThisAddress() {
     CXXThisAlignment = CGM.getClassPointerAlignment(MD->getParent());
   }
 
-  llvm::Type *Ty = ConvertType(MD->getThisObjectType());
+  llvm::Type *Ty = ConvertType(MD->getFunctionObjectParameterType());
   return Address(LoadCXXThis(), Ty, CXXThisAlignment, KnownNonNull);
 }
 
@@ -510,7 +511,7 @@ namespace {
       const CXXDestructorDecl *D = BaseClass->getDestructor();
       // We are already inside a destructor, so presumably the object being
       // destroyed should have the expected type.
-      QualType ThisTy = D->getThisObjectType();
+      QualType ThisTy = D->getFunctionObjectParameterType();
       Address Addr =
         CGF.GetAddressOfDirectBaseInCompleteClass(CGF.LoadCXXThisAddress(),
                                                   DerivedClass, BaseClass,
@@ -1291,10 +1292,10 @@ void CodeGenFunction::EmitCtorPrologue(const CXXConstructorDecl *CD,
     assert(BaseCtorContinueBB);
   }
 
-  llvm::Value *const OldThis = CXXThisValue;
   for (; B != E && (*B)->isBaseInitializer() && (*B)->isBaseVirtual(); B++) {
     if (!ConstructVBases)
       continue;
+    SaveAndRestore ThisRAII(CXXThisValue);
     if (CGM.getCodeGenOpts().StrictVTablePointers &&
         CGM.getCodeGenOpts().OptimizationLevel > 0 &&
         isInitializerOfDynamicClass(*B))
@@ -1311,7 +1312,7 @@ void CodeGenFunction::EmitCtorPrologue(const CXXConstructorDecl *CD,
   // Then, non-virtual base initializers.
   for (; B != E && (*B)->isBaseInitializer(); B++) {
     assert(!(*B)->isBaseVirtual());
-
+    SaveAndRestore ThisRAII(CXXThisValue);
     if (CGM.getCodeGenOpts().StrictVTablePointers &&
         CGM.getCodeGenOpts().OptimizationLevel > 0 &&
         isInitializerOfDynamicClass(*B))
@@ -1319,8 +1320,6 @@ void CodeGenFunction::EmitCtorPrologue(const CXXConstructorDecl *CD,
     EmitBaseInitializer(*this, ClassDecl, *B);
   }
 
-  CXXThisValue = OldThis;
-
   InitializeVTablePointers(ClassDecl);
 
   // And finally, initialize class members.
@@ -1456,7 +1455,7 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) {
     RunCleanupsScope DtorEpilogue(*this);
     EnterDtorCleanups(Dtor, Dtor_Deleting);
     if (HaveInsertPoint()) {
-      QualType ThisTy = Dtor->getThisObjectType();
+      QualType ThisTy = Dtor->getFunctionObjectParameterType();
       EmitCXXDestructorCall(Dtor, Dtor_Complete, /*ForVirtualBase=*/false,
                             /*Delegating=*/false, LoadCXXThisAddress(), ThisTy);
     }
@@ -1490,7 +1489,7 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) {
     EnterDtorCleanups(Dtor, Dtor_Complete);
 
     if (!isTryBody) {
-      QualType ThisTy = Dtor->getThisObjectType();
+      QualType ThisTy = Dtor->getFunctionObjectParameterType();
       EmitCXXDestructorCall(Dtor, Dtor_Base, /*ForVirtualBase=*/false,
                             /*Delegating=*/false, LoadCXXThisAddress(), ThisTy);
       break;
@@ -2114,7 +2113,7 @@ void CodeGenFunction::EmitCXXConstructorCall(const CXXConstructorDecl *D,
   CallArgList Args;
   Address This = ThisAVS.getAddress();
   LangAS SlotAS = ThisAVS.getQualifiers().getAddressSpace();
-  LangAS ThisAS = D->getThisObjectType().getAddressSpace();
+  LangAS ThisAS = D->getFunctionObjectParameterType().getAddressSpace();
   llvm::Value *ThisPtr = This.getPointer();
 
   if (SlotAS != ThisAS) {
@@ -2453,7 +2452,7 @@ namespace {
     void Emit(CodeGenFunction &CGF, Flags flags) override {
       // We are calling the destructor from within the constructor.
       // Therefore, "this" should have the expected type.
-      QualType ThisTy = Dtor->getThisObjectType();
+      QualType ThisTy = Dtor->getFunctionObjectParameterType();
       CGF.EmitCXXDestructorCall(Dtor, Type, /*ForVirtualBase=*/false,
                                 /*Delegating=*/true, Addr, ThisTy);
     }
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 36e29285141b59..c73a63e12f03aa 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -2130,14 +2130,14 @@ CGDebugInfo::CollectTemplateParams(std::optional<TemplateArgs> OArgs,
       // attribute, i.e. that value is not available at the host side.
       if (!CGM.getLangOpts().CUDA || CGM.getLangOpts().CUDAIsDevice ||
           !D->hasAttr<CUDADeviceAttr>()) {
-        const CXXMethodDecl *MD;
         // Variable pointer template parameters have a value that is the address
         // of the variable.
         if (const auto *VD = dyn_cast<VarDecl>(D))
           V = CGM.GetAddrOfGlobalVar(VD);
         // Member function pointers have special support for building them,
         // though this is currently unsupported in LLVM CodeGen.
-        else if ((MD = dyn_cast<CXXMethodDecl>(D)) && MD->isInstance())
+        else if (const auto *MD = dyn_cast<CXXMethodDecl>(D);
+                 MD && MD->isImplicitObjectMemberFunction())
           V = CGM.getCXXABI().EmitMemberFunctionPointer(MD);
         else if (const auto *FD = dyn_cast<FunctionDecl>(D))
           V = CGM.GetAddrOfFunction(FD);
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
index 8a05e4893196ad..30683ad474cd2c 100644
--- a/clang/lib/CodeGen/CGDeclCXX.cpp
+++ b/clang/lib/CodeGen/CGDeclCXX.cpp
@@ -666,13 +666,13 @@ void CodeGenModule::EmitCXXModuleInitFunc(Module *Primary) {
   // Module initializers for imported modules are emitted first.
 
   // Collect all the modules that we import
-  SmallVector<Module *> AllImports;
+  llvm::SmallSetVector<Module *, 8> AllImports;
   // Ones that we export
   for (auto I : Primary->Exports)
-    AllImports.push_back(I.getPointer());
+    AllImports.insert(I.getPointer());
   // Ones that we only import.
   for (Module *M : Primary->Imports)
-    AllImports.push_back(M);
+    AllImports.insert(M);
   // Ones that we import in the global module fragment or the private module
   // fragment.
   llvm::for_each(Primary->submodules(), [&AllImports](Module *SubM) {
@@ -683,7 +683,7 @@ void CodeGenModule::EmitCXXModuleInitFunc(Module *Primary) {
            "The global mdoule fragments and the private module fragments are "
            "not allowed to export import modules.");
     for (Module *M : SubM->Imports)
-      AllImports.push_back(M);
+      AllImports.insert(M);
   });
 
   SmallVector<llvm::Function *, 8> ModuleInits;
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 7b4389c874b3f9..54a1d300a9ac73 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -30,6 +30,7 @@
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/SourceManager.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
@@ -931,16 +932,31 @@ static llvm::Value *getArrayIndexingBound(CodeGenFunction &CGF,
     if (CE->getCastKind() == CK_ArrayToPointerDecay &&
         !CE->getSubExpr()->isFlexibleArrayMemberLike(CGF.getContext(),
                                                      StrictFlexArraysLevel)) {
+      CodeGenFunction::SanitizerScope SanScope(&CGF);
+
       IndexedType = CE->getSubExpr()->getType();
       const ArrayType *AT = IndexedType->castAsArrayTypeUnsafe();
       if (const auto *CAT = dyn_cast<ConstantArrayType>(AT))
         return CGF.Builder.getInt(CAT->getSize());
-      else if (const auto *VAT = dyn_cast<VariableArrayType>(AT))
+
+      if (const auto *VAT = dyn_cast<VariableArrayType>(AT))
         return CGF.getVLASize(VAT).NumElts;
       // Ignore pass_object_size here. It's not applicable on decayed pointers.
     }
+
+    if (FieldDecl *FD = CGF.FindCountedByField(Base, StrictFlexArraysLevel)) {
+      const auto *ME = dyn_cast<MemberExpr>(CE->getSubExpr());
+      IndexedType = Base->getType();
+      return CGF
+          .EmitAnyExprToTemp(MemberExpr::CreateImplicit(
+              CGF.getContext(), const_cast<Expr *>(ME->getBase()),
+              ME->isArrow(), FD, FD->getType(), VK_LValue, OK_Ordinary))
+          .getScalarVal();
+    }
   }
 
+  CodeGenFunction::SanitizerScope SanScope(&CGF);
+
   QualType EltTy{Base->getType()->getPointeeOrArrayElementType(), 0};
   if (llvm::Value *POS = CGF.LoadPassedObjectSize(Base, EltTy)) {
     IndexedType = Base->getType();
@@ -950,13 +966,53 @@ static llvm::Value *getArrayIndexingBound(CodeGenFunction &CGF,
   return nullptr;
 }
 
+FieldDecl *CodeGenFunction::FindCountedByField(
+    const Expr *Base,
+    LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel) {
+  const ValueDecl *VD = nullptr;
+
+  Base = Base->IgnoreParenImpCasts();
+
+  if (const auto *ME = dyn_cast<MemberExpr>(Base)) {
+    VD = dyn_cast<ValueDecl>(ME->getMemberDecl());
+  } else if (const auto *DRE = dyn_cast<DeclRefExpr>(Base)) {
+    // Pointing to the full structure.
+    VD = dyn_cast<ValueDecl>(DRE->getDecl());
+
+    QualType Ty = VD->getType();
+    if (Ty->isPointerType())
+      Ty = Ty->getPointeeType();
+
+    if (const auto *RD = Ty->getAsRecordDecl())
+      VD = RD->getLastField();
+  } else if (const auto *CE = dyn_cast<CastExpr>(Base)) {
+    if (const auto *ME = dyn_cast<MemberExpr>(CE->getSubExpr()))
+      VD = dyn_cast<ValueDecl>(ME->getMemberDecl());
+  }
+
+  const auto *FD = dyn_cast_if_present<FieldDecl>(VD);
+  if (!FD || !FD->getParent() ||
+      !Decl::isFlexibleArrayMemberLike(getContext(), FD, FD->getType(),
+                                       StrictFlexArraysLevel, true))
+    return nullptr;
+
+  const auto *CBA = FD->getAttr<CountedByAttr>();
+  if (!CBA)
+    return nullptr;
+
+  StringRef FieldName = CBA->getCountedByField()->getName();
+  auto It =
+      llvm::find_if(FD->getParent()->fields(), [&](const FieldDecl *Field) {
+        return FieldName == Field->getName();
+      });
+  return It != FD->getParent()->field_end() ? *It : nullptr;
+}
+
 void CodeGenFunction::EmitBoundsCheck(const Expr *E, const Expr *Base,
                                       llvm::Value *Index, QualType IndexType,
                                       bool Accessed) {
   assert(SanOpts.has(SanitizerKind::ArrayBounds) &&
          "should not be called unless adding bounds checks");
-  SanitizerScope SanScope(this);
-
   const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
     getLangOpts().getStrictFlexArraysLevel();
 
@@ -966,6 +1022,8 @@ void CodeGenFunction::EmitBoundsCheck(const Expr *E, const Expr *Base,
   if (!Bound)
     return;
 
+  SanitizerScope SanScope(this);
+
   bool IndexSigned = IndexType->isSignedIntegerOrEnumerationType();
   llvm::Value *IndexVal = Builder.CreateIntCast(Index, SizeTy, IndexSigned);
   llvm::Value *BoundVal = Builder.CreateIntCast(Bound, SizeTy, false);
@@ -2647,9 +2705,8 @@ static LValue EmitFunctionDeclLValue(CodeGenFunction &CGF, const Expr *E,
 
 static LValue EmitCapturedFieldLValue(CodeGenFunction &CGF, const FieldDecl *FD,
                                       llvm::Value *ThisValue) {
-  QualType TagType = CGF.getContext().getTagDeclType(FD->getParent());
-  LValue LV = CGF.MakeNaturalAlignAddrLValue(ThisValue, TagType);
-  return CGF.EmitLValueForField(LV, FD);
+
+  return CGF.EmitLValueForLambdaField(FD, ThisValue);
 }
 
 /// Named Registers are named metadata pointing to the register name
@@ -4262,17 +4319,38 @@ LValue CodeGenFunction::EmitMemberExpr(const MemberExpr *E) {
 
 /// Given that we are currently emitting a lambda, emit an l-value for
 /// one of its members.
-LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field) {
-  if (CurCodeDecl) {
-    assert(cast<CXXMethodDecl>(CurCodeDecl)->getParent()->isLambda());
-    assert(cast<CXXMethodDecl>(CurCodeDecl)->getParent() == Field->getParent());
+///
+LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field,
+                                                 llvm::Value *ThisValue) {
+  bool HasExplicitObjectParameter = false;
+  if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>(CurCodeDecl)) {
+    HasExplicitObjectParameter = MD->isExplicitObjectMemberFunction();
+    assert(MD->getParent()->isLambda());
+    assert(MD->getParent() == Field->getParent());
+  }
+  LValue LambdaLV;
+  if (HasExplicitObjectParameter) {
+    const VarDecl *D = cast<CXXMethodDecl>(CurCodeDecl)->getParamDecl(0);
+    auto It = LocalDeclMap.find(D);
+    assert(It != LocalDeclMap.end() && "explicit parameter not loaded?");
+    Address AddrOfExplicitObject = It->getSecond();
+    if (D->getType()->isReferenceType())
+      LambdaLV = EmitLoadOfReferenceLValue(AddrOfExplicitObject, D->getType(),
+                                           AlignmentSource::Decl);
+    else
+      LambdaLV = MakeNaturalAlignAddrLValue(AddrOfExplicitObject.getPointer(),
+                                            D->getType().getNonReferenceType());
+  } else {
+    QualType LambdaTagType = getContext().getTagDeclType(Field->getParent());
+    LambdaLV = MakeNaturalAlignAddrLValue(ThisValue, LambdaTagType);
   }
-  QualType LambdaTagType =
-    getContext().getTagDeclType(Field->getParent());
-  LValue LambdaLV = MakeNaturalAlignAddrLValue(CXXABIThisValue, LambdaTagType);
   return EmitLValueForField(LambdaLV, Field);
 }
 
+LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field) {
+  return EmitLValueForLambdaField(Field, CXXABIThisValue);
+}
+
 /// Get the field index in the debug info. The debug info structure/union
 /// will ignore the unnamed bitfields.
 unsigned CodeGenFunction::getDebugInfoFIndex(const RecordDecl *Rec,
@@ -4987,9 +5065,12 @@ RValue CodeGenFunction::EmitCallExpr(const CallExpr *E,
   if (const auto *CE = dyn_cast<CUDAKernelCallExpr>(E))
     return EmitCUDAKernelCallExpr(CE, ReturnValue);
 
+  // A CXXOperatorCallExpr is created even for explicit object methods, but
+  // these should be treated like static function call.
   if (const auto *CE = dyn_cast<CXXOperatorCallExpr>(E))
-    if (const CXXMethodDecl *MD =
-          dyn_cast_or_null<CXXMethodDecl>(CE->getCalleeDecl()))
+    if (const auto *MD =
+            dyn_cast_if_present<CXXMethodDecl>(CE->getCalleeDecl());
+        MD && MD->isImplicitObjectMemberFunction())
       return EmitCXXOperatorMemberCallExpr(CE, MD, ReturnValue);
 
   CGCallee callee = EmitCallee(E->getCallee());
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 34d4f37a5d295f..2e7059cc8f5b63 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -41,7 +41,7 @@ commonEmitCXXMemberOrOperatorCall(CodeGenFunction &CGF, GlobalDecl GD,
 
   assert(CE == nullptr || isa<CXXMemberCallExpr>(CE) ||
          isa<CXXOperatorCallExpr>(CE));
-  assert(MD->isInstance() &&
+  assert(MD->isImplicitObjectMemberFunction() &&
          "Trying to emit a member or operator call expr on a static method!");
 
   // Push the this ptr.
@@ -66,7 +66,12 @@ commonEmitCXXMemberOrOperatorCall(CodeGenFunction &CGF, GlobalDecl GD,
     Args.addFrom(*RtlArgs);
   } else if (CE) {
     // Special case: skip first argument of CXXOperatorCall (it is "this").
-    unsigned ArgsToSkip = isa<CXXOperatorCallExpr>(CE) ? 1 : 0;
+    unsigned ArgsToSkip = 0;
+    if (const auto *Op = dyn_cast<CXXOperatorCallExpr>(CE)) {
+      if (const auto *M = dyn_cast<CXXMethodDecl>(Op->getCalleeDecl()))
+        ArgsToSkip =
+            static_cast<unsigned>(!M->isExplicitObjectMemberFunction());
+    }
     CGF.EmitCallArgs(Args, FPT, drop_begin(CE->arguments(), ArgsToSkip),
                      CE->getDirectCallee());
   } else {
@@ -484,7 +489,7 @@ RValue
 CodeGenFunction::EmitCXXOperatorMemberCallExpr(const CXXOperatorCallExpr *E,
                                                const CXXMethodDecl *MD,
                                                ReturnValueSlot ReturnValue) {
-  assert(MD->isInstance() &&
+  assert(MD->isImplicitObjectMemberFunction() &&
          "Trying to emit a member call expr on a static method!");
   return EmitCXXMemberOrOperatorMemberCallExpr(
       E, MD, ReturnValue, /*HasQualifier=*/false, /*Qualifier=*/nullptr,
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index d76ce15b41ac57..93ab064bdf3915 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -3723,10 +3723,12 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF,
   // Explicitly handle GNU void* and function pointer arithmetic extensions. The
   // GNU void* casts amount to no-ops since our void* type is i8*, but this is
   // future proof.
+  llvm::Type *elemTy;
   if (elementType->isVoidType() || elementType->isFunctionType())
-    return CGF.Builder.CreateGEP(CGF.Int8Ty, pointer, index, "add.ptr");
+    elemTy = CGF.Int8Ty;
+  else
+    elemTy = CGF.ConvertTypeForMem(elementType);
 
-  llvm::Type *elemTy = CGF.ConvertTypeForMem(elementType);
   if (CGF.getLangOpts().isSignedOverflowDefined())
     return CGF.Builder.CreateGEP(elemTy, pointer, index, "add.ptr");
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index b4e482ca5cba6e..aae1a0ea250eea 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7734,30 +7734,6 @@ class MappableExprsHandler {
            OpenMPOffloadMappingFlags::OMP_MAP_FROM;
   }
 
-  static OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position) {
-    // Rotate by getFlagMemberOffset() bits.
-    return static_cast<OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
-                                                  << getFlagMemberOffset());
-  }
-
-  static void setCorrectMemberOfFlag(OpenMPOffloadMappingFlags &Flags,
-                                     OpenMPOffloadMappingFlags MemberOfFlag) {
-    // If the entry is PTR_AND_OBJ but has not been marked with the special
-    // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
-    // marked as MEMBER_OF.
-    if (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
-            Flags & OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
-        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
-            (Flags & OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
-            OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
-      return;
-
-    // Reset the placeholder value to prepare the flag for the assignment of the
-    // proper MEMBER_OF value.
-    Flags &= ~OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
-    Flags |= MemberOfFlag;
-  }
-
   void getPlainLayout(const CXXRecordDecl *RD,
                       llvm::SmallVectorImpl<const FieldDecl *> &Layout,
                       bool AsBase) const {
@@ -7825,6 +7801,7 @@ class MappableExprsHandler {
   /// the device pointers info array.
   void generateAllInfoForClauses(
       ArrayRef<const OMPClause *> Clauses, MapCombinedInfoTy &CombinedInfo,
+      llvm::OpenMPIRBuilder &OMPBuilder,
       const llvm::DenseSet<CanonicalDeclPtr<const Decl>> &SkipVarSet =
           llvm::DenseSet<CanonicalDeclPtr<const Decl>>()) const {
     // We have to process the component lists that relate with the same
@@ -8159,7 +8136,7 @@ class MappableExprsHandler {
       if (PartialStruct.Base.isValid()) {
         CurInfo.NonContigInfo.Dims.push_back(0);
         emitCombinedEntry(CombinedInfo, CurInfo.Types, PartialStruct,
-                          /*IsMapThis*/ !VD, VD);
+                          /*IsMapThis*/ !VD, OMPBuilder, VD);
       }
 
       // We need to append the results of this capture to what we already
@@ -8226,6 +8203,7 @@ class MappableExprsHandler {
   void emitCombinedEntry(MapCombinedInfoTy &CombinedInfo,
                          MapFlagsArrayTy &CurTypes,
                          const StructRangeInfoTy &PartialStruct, bool IsMapThis,
+                         llvm::OpenMPIRBuilder &OMPBuilder,
                          const ValueDecl *VD = nullptr,
                          bool NotTargetParams = true) const {
     if (CurTypes.size() == 1 &&
@@ -8260,7 +8238,7 @@ class MappableExprsHandler {
       // of tofrom.
       // Emit this[:1]
       CombinedInfo.Pointers.push_back(PartialStruct.Base.getPointer());
-      QualType Ty = MD->getThisObjectType();
+      QualType Ty = MD->getFunctionObjectParameterType();
       llvm::Value *Size =
           CGF.Builder.CreateIntCast(CGF.getTypeSize(Ty), CGF.Int64Ty,
                                     /*isSigned=*/true);
@@ -8313,9 +8291,9 @@ class MappableExprsHandler {
     // (except for PTR_AND_OBJ entries which do not have a placeholder value
     // 0xFFFF in the MEMBER_OF field).
     OpenMPOffloadMappingFlags MemberOfFlag =
-        getMemberOfFlag(CombinedInfo.BasePointers.size() - 1);
+        OMPBuilder.getMemberOfFlag(CombinedInfo.BasePointers.size() - 1);
     for (auto &M : CurTypes)
-      setCorrectMemberOfFlag(M, MemberOfFlag);
+      OMPBuilder.setCorrectMemberOfFlag(M, MemberOfFlag);
   }
 
   /// Generate all the base pointers, section pointers, sizes, map types, and
@@ -8324,23 +8302,26 @@ class MappableExprsHandler {
   /// pair of the relevant declaration and index where it occurs is appended to
   /// the device pointers info array.
   void generateAllInfo(
-      MapCombinedInfoTy &CombinedInfo,
+      MapCombinedInfoTy &CombinedInfo, llvm::OpenMPIRBuilder &OMPBuilder,
       const llvm::DenseSet<CanonicalDeclPtr<const Decl>> &SkipVarSet =
           llvm::DenseSet<CanonicalDeclPtr<const Decl>>()) const {
     assert(CurDir.is<const OMPExecutableDirective *>() &&
            "Expect a executable directive");
     const auto *CurExecDir = CurDir.get<const OMPExecutableDirective *>();
-    generateAllInfoForClauses(CurExecDir->clauses(), CombinedInfo, SkipVarSet);
+    generateAllInfoForClauses(CurExecDir->clauses(), CombinedInfo, OMPBuilder,
+                              SkipVarSet);
   }
 
   /// Generate all the base pointers, section pointers, sizes, map types, and
   /// mappers for the extracted map clauses of user-defined mapper (all included
   /// in \a CombinedInfo).
-  void generateAllInfoForMapper(MapCombinedInfoTy &CombinedInfo) const {
+  void generateAllInfoForMapper(MapCombinedInfoTy &CombinedInfo,
+                                llvm::OpenMPIRBuilder &OMPBuilder) const {
     assert(CurDir.is<const OMPDeclareMapperDecl *>() &&
            "Expect a declare mapper directive");
     const auto *CurMapperDir = CurDir.get<const OMPDeclareMapperDecl *>();
-    generateAllInfoForClauses(CurMapperDir->clauses(), CombinedInfo);
+    generateAllInfoForClauses(CurMapperDir->clauses(), CombinedInfo,
+                              OMPBuilder);
   }
 
   /// Emit capture info for lambdas for variables captured by reference.
@@ -8422,6 +8403,7 @@ class MappableExprsHandler {
 
   /// Set correct indices for lambdas captures.
   void adjustMemberOfForLambdaCaptures(
+      llvm::OpenMPIRBuilder &OMPBuilder,
       const llvm::DenseMap<llvm::Value *, llvm::Value *> &LambdaPointers,
       MapBaseValuesArrayTy &BasePointers, MapValuesArrayTy &Pointers,
       MapFlagsArrayTy &Types) const {
@@ -8446,8 +8428,9 @@ class MappableExprsHandler {
       // All other current entries will be MEMBER_OF the combined entry
       // (except for PTR_AND_OBJ entries which do not have a placeholder value
       // 0xFFFF in the MEMBER_OF field).
-      OpenMPOffloadMappingFlags MemberOfFlag = getMemberOfFlag(TgtIdx);
-      setCorrectMemberOfFlag(Types[I], MemberOfFlag);
+      OpenMPOffloadMappingFlags MemberOfFlag =
+          OMPBuilder.getMemberOfFlag(TgtIdx);
+      OMPBuilder.setCorrectMemberOfFlag(Types[I], MemberOfFlag);
     }
   }
 
@@ -9069,7 +9052,7 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D,
   llvm::FunctionType *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
   SmallString<64> TyStr;
   llvm::raw_svector_ostream Out(TyStr);
-  CGM.getCXXABI().getMangleContext().mangleTypeName(Ty, Out);
+  CGM.getCXXABI().getMangleContext().mangleCanonicalTypeName(Ty, Out);
   std::string Name = getName({"omp_mapper", TyStr, D->getName()});
   auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
                                     Name, &CGM.getModule());
@@ -9141,7 +9124,7 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D,
   // Get map clause information. Fill up the arrays with all mapped variables.
   MappableExprsHandler::MapCombinedInfoTy Info;
   MappableExprsHandler MEHandler(*D, MapperCGF);
-  MEHandler.generateAllInfoForMapper(Info);
+  MEHandler.generateAllInfoForMapper(Info, OMPBuilder);
 
   // Call the runtime API __tgt_mapper_num_components to get the number of
   // pre-existing components.
@@ -9525,7 +9508,8 @@ static void emitTargetCallKernelLaunch(
       CombinedInfo.append(PartialStruct.PreliminaryMapData);
       MEHandler.emitCombinedEntry(
           CombinedInfo, CurInfo.Types, PartialStruct, CI->capturesThis(),
-          nullptr, !PartialStruct.PreliminaryMapData.BasePointers.empty());
+          OMPBuilder, nullptr,
+          !PartialStruct.PreliminaryMapData.BasePointers.empty());
     }
 
     // We need to append the results of this capture to what we already have.
@@ -9533,11 +9517,11 @@ static void emitTargetCallKernelLaunch(
   }
   // Adjust MEMBER_OF flags for the lambdas captures.
   MEHandler.adjustMemberOfForLambdaCaptures(
-      LambdaPointers, CombinedInfo.BasePointers, CombinedInfo.Pointers,
-      CombinedInfo.Types);
+      OMPBuilder, LambdaPointers, CombinedInfo.BasePointers,
+      CombinedInfo.Pointers, CombinedInfo.Types);
   // Map any list items in a map clause that were not captures because they
   // weren't referenced within the construct.
-  MEHandler.generateAllInfo(CombinedInfo, MappedVarSet);
+  MEHandler.generateAllInfo(CombinedInfo, OMPBuilder, MappedVarSet);
 
   CGOpenMPRuntime::TargetDataInfo Info;
   // Fill up the arrays and create the arguments.
@@ -10272,7 +10256,7 @@ void CGOpenMPRuntime::emitTargetDataCalls(
     CGF.Builder.restoreIP(CodeGenIP);
     // Get map clause information.
     MappableExprsHandler MEHandler(D, CGF);
-    MEHandler.generateAllInfo(CombinedInfo);
+    MEHandler.generateAllInfo(CombinedInfo, OMPBuilder);
 
     auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) {
       return emitMappingInformation(CGF, OMPBuilder, MapExpr);
@@ -10478,7 +10462,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
 
     // Get map clause information.
     MappableExprsHandler MEHandler(D, CGF);
-    MEHandler.generateAllInfo(CombinedInfo);
+    MEHandler.generateAllInfo(CombinedInfo, OMPBuilder);
 
     CGOpenMPRuntime::TargetDataInfo Info;
     // Fill up the arrays and create the arguments.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 93819ab815add0..56de8d39fe856d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -551,10 +551,9 @@ CGOpenMPRuntimeGPU::getExecutionMode() const {
   return CurrentExecutionMode;
 }
 
-static CGOpenMPRuntimeGPU::DataSharingMode
-getDataSharingMode(CodeGenModule &CGM) {
-  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
-                                          : CGOpenMPRuntimeGPU::Generic;
+CGOpenMPRuntimeGPU::DataSharingMode
+CGOpenMPRuntimeGPU::getDataSharingMode() const {
+  return CurrentDataSharingMode;
 }
 
 /// Check for inner (nested) SPMD construct, if any
@@ -752,6 +751,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
   EntryFunctionState EST;
   WrapperFunctionsMap.clear();
 
+  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
+  assert(!IsBareKernel && "bare kernel should not be at generic mode");
+
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
@@ -760,15 +762,13 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
         : EST(EST) {}
     void Enter(CodeGenFunction &CGF) override {
-      auto &RT =
-          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
       RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
       // Skip target region initialization.
       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
     }
     void Exit(CodeGenFunction &CGF) override {
-      auto &RT =
-          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
       RT.clearLocThreadIdInsertPt(CGF);
       RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
     }
@@ -807,25 +807,39 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
   EntryFunctionState EST;
 
+  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
+
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
     CGOpenMPRuntimeGPU &RT;
     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
+    bool IsBareKernel;
+    DataSharingMode Mode;
 
   public:
     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
-                         CGOpenMPRuntimeGPU::EntryFunctionState &EST)
-        : RT(RT), EST(EST) {}
+                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
+                         bool IsBareKernel)
+        : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
+          Mode(RT.CurrentDataSharingMode) {}
     void Enter(CodeGenFunction &CGF) override {
+      if (IsBareKernel) {
+        RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
+        return;
+      }
       RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
       // Skip target region initialization.
       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
     }
     void Exit(CodeGenFunction &CGF) override {
+      if (IsBareKernel) {
+        RT.CurrentDataSharingMode = Mode;
+        return;
+      }
       RT.clearLocThreadIdInsertPt(CGF);
       RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
     }
-  } Action(*this, EST);
+  } Action(*this, EST, IsBareKernel);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
@@ -843,7 +857,8 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
-  if (Mode)
+  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
+  if (Mode || IsBareKernel)
     emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                    CodeGen);
   else
@@ -863,6 +878,9 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
   if (!CGM.getLangOpts().OpenMPIsTargetDevice)
     llvm_unreachable("OpenMP can only handle device code.");
 
+  if (CGM.getLangOpts().OpenMPCUDAMode)
+    CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
+
   llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
   if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
     return;
@@ -1030,7 +1048,7 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                  SourceLocation Loc,
                                                  bool WithSPMDCheck) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
+  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic &&
       getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
     return;
 
@@ -1142,7 +1160,7 @@ void CGOpenMPRuntimeGPU::getKmpcFreeShared(
 
 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                                  bool WithSPMDCheck) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
+  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic &&
       getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
     return;
 
@@ -1178,11 +1196,18 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
   if (!CGF.HaveInsertPoint())
     return;
 
+  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
+
   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                       /*Name=*/".zero.addr");
   CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
+  // We don't emit any thread id function call in bare kernel, but because the
+  // outlined function has a pointer argument, we emit a nullptr here.
+  if (IsBareKernel)
+    OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
+  else
+    OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
@@ -3273,7 +3298,7 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
 
 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                               const Decl *D) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
+  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
     return;
 
   assert(D && "Expected function or captured|block decl.");
@@ -3382,7 +3407,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
         VarTy, Align);
   }
 
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
+  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
     return Address::invalid();
 
   VD = VD->getCanonicalDecl();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index dddfe5a94dccb8..86871dfce418fd 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -32,6 +32,18 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+
+  /// Target codegen is specialized based on two data-sharing modes: CUDA, in
+  /// which the local variables are actually global threadlocal, and Generic, in
+  /// which the local variables are placed in global memory if they may escape
+  /// their declaration context.
+  enum DataSharingMode {
+    /// CUDA data sharing mode.
+    DS_CUDA,
+    /// Generic data-sharing mode.
+    DS_Generic,
+  };
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -42,6 +54,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
 
   ExecutionMode getExecutionMode() const;
 
+  DataSharingMode getDataSharingMode() const;
+
   /// Get barrier to synchronize all threads in a block.
   void syncCTAThreads(CodeGenFunction &CGF);
 
@@ -297,17 +311,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
   Address getAddressOfLocalVariable(CodeGenFunction &CGF,
                                     const VarDecl *VD) override;
 
-  /// Target codegen is specialized based on two data-sharing modes: CUDA, in
-  /// which the local variables are actually global threadlocal, and Generic, in
-  /// which the local variables are placed in global memory if they may escape
-  /// their declaration context.
-  enum DataSharingMode {
-    /// CUDA data sharing mode.
-    CUDA,
-    /// Generic data-sharing mode.
-    Generic,
-  };
-
   /// Cleans up references to the objects in finished function.
   ///
   void functionFinished(CodeGenFunction &CGF) override;
@@ -343,6 +346,10 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
   /// to emit optimized code.
   ExecutionMode CurrentExecutionMode = EM_Unknown;
 
+  /// Track the data sharing mode when codegening directives within a target
+  /// region.
+  DataSharingMode CurrentDataSharingMode = DataSharingMode::DS_Generic;
+
   /// true if currently emitting code for target/teams/distribute region, false
   /// - otherwise.
   bool IsInTTDRegion = false;
diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index 23cfcdd138439f..25e4b1c2793202 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
 #include <cstdio>
+#include <utility>
 
 using namespace clang;
 using namespace CodeGen;
@@ -201,7 +202,7 @@ CodeGenFunction::GenerateVarArgsThunk(llvm::Function *Fn,
   // Find the first store of "this", which will be to the alloca associated
   // with "this".
   Address ThisPtr =
-      Address(&*AI, ConvertTypeForMem(MD->getThisObjectType()),
+      Address(&*AI, ConvertTypeForMem(MD->getFunctionObjectParameterType()),
               CGM.getClassPointerAlignment(MD->getParent()));
   llvm::BasicBlock *EntryBB = &Fn->front();
   llvm::BasicBlock::iterator ThisStore =
@@ -639,8 +640,16 @@ void CodeGenVTables::addRelativeComponent(ConstantArrayBuilder &builder,
   // want the stub/proxy to be emitted for properly calculating the offset.
   // Examples where there would be no symbol emitted are available_externally
   // and private linkages.
-  auto stubLinkage = vtableHasLocalLinkage ? llvm::GlobalValue::InternalLinkage
-                                           : llvm::GlobalValue::ExternalLinkage;
+  //
+  // `internal` linkage results in STB_LOCAL Elf binding while still manifesting a
+  // local symbol.
+  //
+  // `linkonce_odr` linkage results in a STB_DEFAULT Elf binding but also allows for
+  // the rtti_proxy to be transparently replaced with a GOTPCREL reloc by a
+  // target that supports this replacement.
+  auto stubLinkage = vtableHasLocalLinkage
+                         ? llvm::GlobalValue::InternalLinkage
+                         : llvm::GlobalValue::LinkOnceODRLinkage;
 
   llvm::Constant *target;
   if (auto *func = dyn_cast<llvm::Function>(globalVal)) {
@@ -1308,43 +1317,34 @@ void CodeGenModule::EmitVTableTypeMetadata(const CXXRecordDecl *RD,
 
   CharUnits ComponentWidth = GetTargetTypeStoreSize(getVTableComponentType());
 
-  typedef std::pair<const CXXRecordDecl *, unsigned> AddressPoint;
+  struct AddressPoint {
+    const CXXRecordDecl *Base;
+    size_t Offset;
+    std::string TypeName;
+    bool operator<(const AddressPoint &RHS) const {
+      int D = TypeName.compare(RHS.TypeName);
+      return D < 0 || (D == 0 && Offset < RHS.Offset);
+    }
+  };
   std::vector<AddressPoint> AddressPoints;
-  for (auto &&AP : VTLayout.getAddressPoints())
-    AddressPoints.push_back(std::make_pair(
-        AP.first.getBase(), VTLayout.getVTableOffset(AP.second.VTableIndex) +
-                                AP.second.AddressPointIndex));
+  for (auto &&AP : VTLayout.getAddressPoints()) {
+    AddressPoint N{AP.first.getBase(),
+                   VTLayout.getVTableOffset(AP.second.VTableIndex) +
+                       AP.second.AddressPointIndex,
+                   {}};
+    llvm::raw_string_ostream Stream(N.TypeName);
+    getCXXABI().getMangleContext().mangleCanonicalTypeName(
+        QualType(N.Base->getTypeForDecl(), 0), Stream);
+    AddressPoints.push_back(std::move(N));
+  }
 
   // Sort the address points for determinism.
-  llvm::sort(AddressPoints, [this](const AddressPoint &AP1,
-                                   const AddressPoint &AP2) {
-    if (&AP1 == &AP2)
-      return false;
-
-    std::string S1;
-    llvm::raw_string_ostream O1(S1);
-    getCXXABI().getMangleContext().mangleTypeName(
-        QualType(AP1.first->getTypeForDecl(), 0), O1);
-    O1.flush();
-
-    std::string S2;
-    llvm::raw_string_ostream O2(S2);
-    getCXXABI().getMangleContext().mangleTypeName(
-        QualType(AP2.first->getTypeForDecl(), 0), O2);
-    O2.flush();
-
-    if (S1 < S2)
-      return true;
-    if (S1 != S2)
-      return false;
-
-    return AP1.second < AP2.second;
-  });
+  llvm::sort(AddressPoints);
 
   ArrayRef<VTableComponent> Comps = VTLayout.vtable_components();
   for (auto AP : AddressPoints) {
     // Create type metadata for the address point.
-    AddVTableTypeMetadata(VTable, ComponentWidth * AP.second, AP.first);
+    AddVTableTypeMetadata(VTable, ComponentWidth * AP.Offset, AP.Base);
 
     // The class associated with each address point could also potentially be
     // used for indirect calls via a member function pointer, so we need to
@@ -1356,7 +1356,7 @@ void CodeGenModule::EmitVTableTypeMetadata(const CXXRecordDecl *RD,
       llvm::Metadata *MD = CreateMetadataIdentifierForVirtualMemPtrType(
           Context.getMemberPointerType(
               Comps[I].getFunctionDecl()->getType(),
-              Context.getRecordType(AP.first).getTypePtr()));
+              Context.getRecordType(AP.Base).getTypePtr()));
       VTable->addTypeMetadata((ComponentWidth * I).getQuantity(), MD);
     }
   }
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index bf83171e2c6814..9b21f428b0af7f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -576,7 +576,7 @@ CodeGenFunction::getUBSanFunctionTypeHash(QualType Ty) const {
     Ty = getContext().getFunctionTypeWithExceptionSpec(Ty, EST_None);
   std::string Mangled;
   llvm::raw_string_ostream Out(Mangled);
-  CGM.getCXXABI().getMangleContext().mangleTypeName(Ty, Out, false);
+  CGM.getCXXABI().getMangleContext().mangleCanonicalTypeName(Ty, Out, false);
   return llvm::ConstantInt::get(
       CGM.Int32Ty, static_cast<uint32_t>(llvm::xxh3_64bits(Mangled)));
 }
@@ -1167,12 +1167,13 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
 
   EmitFunctionProlog(*CurFnInfo, CurFn, Args);
 
-  if (isa_and_nonnull<CXXMethodDecl>(D) &&
-      cast<CXXMethodDecl>(D)->isInstance()) {
-    CGM.getCXXABI().EmitInstanceFunctionProlog(*this);
-    const CXXMethodDecl *MD = cast<CXXMethodDecl>(D);
-    if (MD->getParent()->isLambda() &&
-        MD->getOverloadedOperator() == OO_Call) {
+  if (const CXXMethodDecl *MD = dyn_cast_if_present<CXXMethodDecl>(D);
+      MD && !MD->isStatic()) {
+    bool IsInLambda =
+        MD->getParent()->isLambda() && MD->getOverloadedOperator() == OO_Call;
+    if (MD->isImplicitObjectMemberFunction())
+      CGM.getCXXABI().EmitInstanceFunctionProlog(*this);
+    if (IsInLambda) {
       // We're in a lambda; figure out the captures.
       MD->getParent()->getCaptureFields(LambdaCaptureFields,
                                         LambdaThisCaptureField);
@@ -1202,7 +1203,7 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
           VLASizeMap[VAT->getSizeExpr()] = ExprArg;
         }
       }
-    } else {
+    } else if (MD->isImplicitObjectMemberFunction()) {
       // Not in a lambda; just use 'this' from the method.
       // FIXME: Should we generate a new load for each use of 'this'?  The
       // fast register allocator would be happier...
@@ -1313,7 +1314,7 @@ QualType CodeGenFunction::BuildFunctionArgList(GlobalDecl GD,
   QualType ResTy = FD->getReturnType();
 
   const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD);
-  if (MD && MD->isInstance()) {
+  if (MD && MD->isImplicitObjectMemberFunction()) {
     if (CGM.getCXXABI().HasThisReturn(GD))
       ResTy = MD->getThisType();
     else if (CGM.getCXXABI().hasMostDerivedReturn(GD))
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 60f2f21de53ab9..d5336382a2b9c9 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3022,6 +3022,12 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitBoundsCheck(const Expr *E, const Expr *Base, llvm::Value *Index,
                        QualType IndexType, bool Accessed);
 
+  /// Find the FieldDecl specified in a FAM's "counted_by" attribute. Returns
+  /// \p nullptr if either the attribute or the field doesn't exist.
+  FieldDecl *FindCountedByField(
+      const Expr *Base,
+      LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel);
+
   llvm::Value *EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
                                        bool isInc, bool isPre);
   ComplexPairTy EmitComplexPrePostIncDec(const UnaryOperator *E, LValue LV,
@@ -4017,6 +4023,8 @@ class CodeGenFunction : public CodeGenTypeCache {
                                            const ObjCIvarDecl *Ivar);
   LValue EmitLValueForField(LValue Base, const FieldDecl* Field);
   LValue EmitLValueForLambdaField(const FieldDecl *Field);
+  LValue EmitLValueForLambdaField(const FieldDecl *Field,
+                                  llvm::Value *ThisValue);
 
   /// EmitLValueForFieldInitialization - Like EmitLValueForField, except that
   /// if the Field is a reference, this will return the address of the reference
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c96523c5256258..cae9dd93bc5592 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1390,6 +1390,19 @@ void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
   }
   if (!D)
     return;
+
+  // OpenMP declare target variables must be visible to the host so they can
+  // be registered. We require protected visibility unless the variable has
+  // the DT_nohost modifier and does not need to be registered.
+  if (Context.getLangOpts().OpenMP &&
+      Context.getLangOpts().OpenMPIsTargetDevice && isa<VarDecl>(D) &&
+      D->hasAttr<OMPDeclareTargetDeclAttr>() &&
+      D->getAttr<OMPDeclareTargetDeclAttr>()->getDevType() !=
+          OMPDeclareTargetDeclAttr::DT_NoHost) {
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    return;
+  }
+
   // Set visibility for definitions, and for declarations if requested globally
   // or set explicitly.
   LinkageInfo LV = D->getLinkageAndVisibility();
@@ -2006,7 +2019,7 @@ llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T) {
 
   std::string OutName;
   llvm::raw_string_ostream Out(OutName);
-  getCXXABI().getMangleContext().mangleTypeName(
+  getCXXABI().getMangleContext().mangleCanonicalTypeName(
       T, Out, getCodeGenOpts().SanitizeCfiICallNormalizeIntegers);
 
   if (getCodeGenOpts().SanitizeCfiICallNormalizeIntegers)
@@ -2246,8 +2259,8 @@ static bool requiresMemberFunctionPointerTypeMetadata(CodeGenModule &CGM,
 
   // Only functions whose address can be taken with a member function pointer
   // need this sort of type metadata.
-  return !MD->isStatic() && !MD->isVirtual() && !isa<CXXConstructorDecl>(MD) &&
-         !isa<CXXDestructorDecl>(MD);
+  return MD->isImplicitObjectMemberFunction() && !MD->isVirtual() &&
+         !isa<CXXConstructorDecl, CXXDestructorDecl>(MD);
 }
 
 SmallVector<const CXXRecordDecl *, 0>
@@ -2938,6 +2951,9 @@ void CodeGenModule::EmitModuleInitializers(clang::Module *Primary) {
   // Third any associated with the Privat eMOdule Fragment, if present.
   if (auto PMF = Primary->getPrivateModuleFragment()) {
     for (Decl *D : getContext().getModuleInitializers(PMF)) {
+      // Skip import decls, the inits for those are called explicitly.
+      if (isa<ImportDecl>(D))
+        continue;
       assert(isa<VarDecl>(D) && "PMF initializer decl is not a var?");
       EmitTopLevelDecl(D);
     }
@@ -4787,7 +4803,8 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty,
   assert(getContext().getTargetAddressSpace(ExpectedAS) == TargetAS);
   if (DAddrSpace != ExpectedAS) {
     return getTargetCodeGenInfo().performAddrSpaceCast(
-        *this, GV, DAddrSpace, ExpectedAS, Ty->getPointerTo(TargetAS));
+        *this, GV, DAddrSpace, ExpectedAS,
+        llvm::PointerType::get(getLLVMContext(), TargetAS));
   }
 
   return GV;
@@ -4999,7 +5016,8 @@ castStringLiteralToDefaultAddressSpace(CodeGenModule &CGM,
     if (AS != LangAS::Default)
       Cast = CGM.getTargetCodeGenInfo().performAddrSpaceCast(
           CGM, GV, AS, LangAS::Default,
-          GV->getValueType()->getPointerTo(
+          llvm::PointerType::get(
+              CGM.getLLVMContext(),
               CGM.getContext().getTargetAddressSpace(LangAS::Default)));
   }
   return Cast;
@@ -6374,7 +6392,8 @@ ConstantAddress CodeGenModule::GetAddrOfGlobalTemporary(
   if (AddrSpace != LangAS::Default)
     CV = getTargetCodeGenInfo().performAddrSpaceCast(
         *this, GV, AddrSpace, LangAS::Default,
-        Type->getPointerTo(
+        llvm::PointerType::get(
+            getLLVMContext(),
             getContext().getTargetAddressSpace(LangAS::Default)));
 
   // Update the map with the new temporary. If we created a placeholder above,
@@ -7200,7 +7219,7 @@ CodeGenModule::CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
   if (isExternallyVisible(T->getLinkage())) {
     std::string OutName;
     llvm::raw_string_ostream Out(OutName);
-    getCXXABI().getMangleContext().mangleTypeName(
+    getCXXABI().getMangleContext().mangleCanonicalTypeName(
         T, Out, getCodeGenOpts().SanitizeCfiICallNormalizeIntegers);
 
     if (getCodeGenOpts().SanitizeCfiICallNormalizeIntegers)
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 395ed7b1d703a5..8705d3d65f1a57 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -205,7 +205,7 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
 
     SmallString<256> OutName;
     llvm::raw_svector_ostream Out(OutName);
-    MContext.mangleTypeName(QualType(ETy, 0), Out);
+    MContext.mangleCanonicalTypeName(QualType(ETy, 0), Out);
     return createScalarTypeNode(OutName, getChar(), Size);
   }
 
@@ -391,7 +391,7 @@ llvm::MDNode *CodeGenTBAA::getBaseTypeInfoHelper(const Type *Ty) {
     if (Features.CPlusPlus) {
       // Don't use the mangler for C code.
       llvm::raw_svector_ostream Out(OutName);
-      MContext.mangleTypeName(QualType(Ty, 0), Out);
+      MContext.mangleCanonicalTypeName(QualType(Ty, 0), Out);
     } else {
       OutName = RD->getName();
     }
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index dc628b7345f59f..f6a614b3e4d54d 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -308,12 +308,13 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
   if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
     return false;
 
-  return D->hasAttr<OpenCLKernelAttr>() ||
-         (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
-         (isa<VarDecl>(D) &&
-          (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
-           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
-           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
+  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
+         (D->hasAttr<OpenCLKernelAttr>() ||
+          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
+          (isa<VarDecl>(D) &&
+           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
+            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
+            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
 }
 
 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 84b8fc7685fed4..77328e1f99e502 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -767,7 +767,8 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                    [](std::pair<types::ID, const llvm::opt::Arg *> &I) {
                      return types::isHIP(I.first);
                    }) ||
-      C.getInputArgs().hasArg(options::OPT_hip_link);
+      C.getInputArgs().hasArg(options::OPT_hip_link) ||
+      C.getInputArgs().hasArg(options::OPT_hipstdpar);
   if (IsCuda && IsHIP) {
     Diag(clang::diag::err_drv_mix_cuda_hip);
     return;
@@ -2705,6 +2706,10 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
         }
       }
 
+      if ((Ty == types::TY_C || Ty == types::TY_CXX) &&
+          Args.hasArgNoClaim(options::OPT_hipstdpar))
+        Ty = types::TY_HIP;
+
       if (DiagnoseInputExistence(Args, Value, Ty, /*TypoCorrect=*/true))
         Inputs.push_back(std::make_pair(Ty, A));
 
@@ -3915,6 +3920,11 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args,
   phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg);
 
   if (FinalPhase == phases::Link) {
+    if (Args.hasArgNoClaim(options::OPT_hipstdpar)) {
+      Args.AddFlagArg(nullptr, getOpts().getOption(options::OPT_hip_link));
+      Args.AddFlagArg(nullptr,
+                      getOpts().getOption(options::OPT_frtlib_add_rpath));
+    }
     // Emitting LLVM while linking disabled except in HIPAMD Toolchain
     if (Args.hasArg(options::OPT_emit_llvm) && !Args.hasArg(options::OPT_hip_link))
       Diag(clang::diag::err_drv_emit_llvm_link);
diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp
index d11c41605bf39e..0ad9de299603ba 100644
--- a/clang/lib/Driver/OffloadBundler.cpp
+++ b/clang/lib/Driver/OffloadBundler.cpp
@@ -21,24 +21,29 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ArchiveWriter.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compression.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MD5.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/StringSaver.h"
+#include "llvm/Support/Timer.h"
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Host.h"
@@ -48,6 +53,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <forward_list>
+#include <llvm/Support/Process.h>
 #include <memory>
 #include <set>
 #include <string>
@@ -58,6 +64,10 @@ using namespace llvm;
 using namespace llvm::object;
 using namespace clang;
 
+static llvm::TimerGroup
+    ClangOffloadBundlerTimerGroup("Clang Offload Bundler Timer Group",
+                                  "Timer group for clang offload bundler");
+
 /// Magic string that marks the existence of offloading data.
 #define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__"
 
@@ -224,20 +234,22 @@ class FileHandler {
 
   /// Write the header of the bundled file to \a OS based on the information
   /// gathered from \a Inputs.
-  virtual Error WriteHeader(raw_fd_ostream &OS,
+  virtual Error WriteHeader(raw_ostream &OS,
                             ArrayRef<std::unique_ptr<MemoryBuffer>> Inputs) = 0;
 
   /// Write the marker that initiates a bundle for the triple \a TargetTriple to
   /// \a OS.
-  virtual Error WriteBundleStart(raw_fd_ostream &OS,
-                                 StringRef TargetTriple) = 0;
+  virtual Error WriteBundleStart(raw_ostream &OS, StringRef TargetTriple) = 0;
 
   /// Write the marker that closes a bundle for the triple \a TargetTriple to \a
   /// OS.
-  virtual Error WriteBundleEnd(raw_fd_ostream &OS, StringRef TargetTriple) = 0;
+  virtual Error WriteBundleEnd(raw_ostream &OS, StringRef TargetTriple) = 0;
 
   /// Write the bundle from \a Input into \a OS.
-  virtual Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) = 0;
+  virtual Error WriteBundle(raw_ostream &OS, MemoryBuffer &Input) = 0;
+
+  /// Finalize output file.
+  virtual Error finalizeOutputFile() { return Error::success(); }
 
   /// List bundle IDs in \a Input.
   virtual Error listBundleIDs(MemoryBuffer &Input) {
@@ -311,7 +323,7 @@ static uint64_t Read8byteIntegerFromBuffer(StringRef Buffer, size_t pos) {
 }
 
 /// Write 8-byte integers to a buffer in little-endian format.
-static void Write8byteIntegerToBuffer(raw_fd_ostream &OS, uint64_t Val) {
+static void Write8byteIntegerToBuffer(raw_ostream &OS, uint64_t Val) {
   llvm::support::endian::write(OS, Val, llvm::support::little);
 }
 
@@ -359,8 +371,7 @@ class BinaryFileHandler final : public FileHandler {
       return Error::success();
 
     // Check if no magic was found.
-    StringRef Magic(FC.data(), sizeof(OFFLOAD_BUNDLER_MAGIC_STR) - 1);
-    if (!Magic.equals(OFFLOAD_BUNDLER_MAGIC_STR))
+    if (llvm::identify_magic(FC) != llvm::file_magic::offload_bundle)
       return Error::success();
 
     // Read number of bundles.
@@ -435,7 +446,7 @@ class BinaryFileHandler final : public FileHandler {
     return Error::success();
   }
 
-  Error WriteHeader(raw_fd_ostream &OS,
+  Error WriteHeader(raw_ostream &OS,
                     ArrayRef<std::unique_ptr<MemoryBuffer>> Inputs) final {
 
     // Compute size of the header.
@@ -472,19 +483,27 @@ class BinaryFileHandler final : public FileHandler {
     return Error::success();
   }
 
-  Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final {
+  Error WriteBundleStart(raw_ostream &OS, StringRef TargetTriple) final {
     CurWriteBundleTarget = TargetTriple.str();
     return Error::success();
   }
 
-  Error WriteBundleEnd(raw_fd_ostream &OS, StringRef TargetTriple) final {
+  Error WriteBundleEnd(raw_ostream &OS, StringRef TargetTriple) final {
     return Error::success();
   }
 
-  Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final {
+  Error WriteBundle(raw_ostream &OS, MemoryBuffer &Input) final {
     auto BI = BundlesInfo[CurWriteBundleTarget];
-    OS.seek(BI.Offset);
+
+    // Pad with 0 to reach specified offset.
+    size_t CurrentPos = OS.tell();
+    size_t PaddingSize = BI.Offset > CurrentPos ? BI.Offset - CurrentPos : 0;
+    for (size_t I = 0; I < PaddingSize; ++I)
+      OS.write('\0');
+    assert(OS.tell() == BI.Offset);
+
     OS.write(Input.getBufferStart(), Input.getBufferSize());
+
     return Error::success();
   }
 };
@@ -541,7 +560,7 @@ class ObjectFileHandler final : public FileHandler {
       return NameOrErr.takeError();
 
     // If it does not start with the reserved suffix, just skip this section.
-    if (!NameOrErr->startswith(OFFLOAD_BUNDLER_MAGIC_STR))
+    if (llvm::identify_magic(*NameOrErr) != llvm::file_magic::offload_bundle)
       return std::nullopt;
 
     // Return the triple that is right after the reserved prefix.
@@ -607,7 +626,7 @@ class ObjectFileHandler final : public FileHandler {
     return Error::success();
   }
 
-  Error WriteHeader(raw_fd_ostream &OS,
+  Error WriteHeader(raw_ostream &OS,
                     ArrayRef<std::unique_ptr<MemoryBuffer>> Inputs) final {
     assert(BundlerConfig.HostInputIndex != ~0u &&
            "Host input index not defined.");
@@ -617,12 +636,16 @@ class ObjectFileHandler final : public FileHandler {
     return Error::success();
   }
 
-  Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final {
+  Error WriteBundleStart(raw_ostream &OS, StringRef TargetTriple) final {
     ++NumberOfProcessedInputs;
     return Error::success();
   }
 
-  Error WriteBundleEnd(raw_fd_ostream &OS, StringRef TargetTriple) final {
+  Error WriteBundleEnd(raw_ostream &OS, StringRef TargetTriple) final {
+    return Error::success();
+  }
+
+  Error finalizeOutputFile() final {
     assert(NumberOfProcessedInputs <= NumberOfInputs &&
            "Processing more inputs that actually exist!");
     assert(BundlerConfig.HostInputIndex != ~0u &&
@@ -640,10 +663,6 @@ class ObjectFileHandler final : public FileHandler {
     assert(BundlerConfig.ObjcopyPath != "" &&
            "llvm-objcopy path not specified");
 
-    // We write to the output file directly. So, we close it and use the name
-    // to pass down to llvm-objcopy.
-    OS.close();
-
     // Temporary files that need to be removed.
     TempFileHandlerRAII TempFiles;
 
@@ -684,7 +703,7 @@ class ObjectFileHandler final : public FileHandler {
     return Error::success();
   }
 
-  Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final {
+  Error WriteBundle(raw_ostream &OS, MemoryBuffer &Input) final {
     return Error::success();
   }
 
@@ -781,22 +800,22 @@ class TextFileHandler final : public FileHandler {
     return Error::success();
   }
 
-  Error WriteHeader(raw_fd_ostream &OS,
+  Error WriteHeader(raw_ostream &OS,
                     ArrayRef<std::unique_ptr<MemoryBuffer>> Inputs) final {
     return Error::success();
   }
 
-  Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final {
+  Error WriteBundleStart(raw_ostream &OS, StringRef TargetTriple) final {
     OS << BundleStartString << TargetTriple << "\n";
     return Error::success();
   }
 
-  Error WriteBundleEnd(raw_fd_ostream &OS, StringRef TargetTriple) final {
+  Error WriteBundleEnd(raw_ostream &OS, StringRef TargetTriple) final {
     OS << BundleEndString << TargetTriple << "\n";
     return Error::success();
   }
 
-  Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final {
+  Error WriteBundle(raw_ostream &OS, MemoryBuffer &Input) final {
     OS << Input.getBuffer();
     return Error::success();
   }
@@ -881,6 +900,187 @@ CreateFileHandler(MemoryBuffer &FirstInput,
                            "'" + FilesType + "': invalid file type specified");
 }
 
+OffloadBundlerConfig::OffloadBundlerConfig() {
+  auto IgnoreEnvVarOpt =
+      llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_IGNORE_ENV_VAR");
+  if (IgnoreEnvVarOpt.has_value() && IgnoreEnvVarOpt.value() == "1")
+    return;
+
+  auto VerboseEnvVarOpt = llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_VERBOSE");
+  if (VerboseEnvVarOpt.has_value())
+    Verbose = VerboseEnvVarOpt.value() == "1";
+
+  auto CompressEnvVarOpt =
+      llvm::sys::Process::GetEnv("OFFLOAD_BUNDLER_COMPRESS");
+  if (CompressEnvVarOpt.has_value())
+    Compress = CompressEnvVarOpt.value() == "1";
+}
+
+llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
+CompressedOffloadBundle::compress(const llvm::MemoryBuffer &Input,
+                                  bool Verbose) {
+  llvm::Timer HashTimer("Hash Calculation Timer", "Hash calculation time",
+                        ClangOffloadBundlerTimerGroup);
+  if (Verbose)
+    HashTimer.startTimer();
+  llvm::MD5 Hash;
+  llvm::MD5::MD5Result Result;
+  Hash.update(Input.getBuffer());
+  Hash.final(Result);
+  uint64_t TruncatedHash = Result.low();
+  if (Verbose)
+    HashTimer.stopTimer();
+
+  SmallVector<uint8_t, 0> CompressedBuffer;
+  auto BufferUint8 = llvm::ArrayRef<uint8_t>(
+      reinterpret_cast<const uint8_t *>(Input.getBuffer().data()),
+      Input.getBuffer().size());
+
+  llvm::compression::Format CompressionFormat;
+
+  if (llvm::compression::zstd::isAvailable())
+    CompressionFormat = llvm::compression::Format::Zstd;
+  else if (llvm::compression::zlib::isAvailable())
+    CompressionFormat = llvm::compression::Format::Zlib;
+  else
+    return createStringError(llvm::inconvertibleErrorCode(),
+                             "Compression not supported");
+
+  llvm::Timer CompressTimer("Compression Timer", "Compression time",
+                            ClangOffloadBundlerTimerGroup);
+  if (Verbose)
+    CompressTimer.startTimer();
+  llvm::compression::compress(CompressionFormat, BufferUint8, CompressedBuffer);
+  if (Verbose)
+    CompressTimer.stopTimer();
+
+  uint16_t CompressionMethod = static_cast<uint16_t>(CompressionFormat);
+  uint32_t UncompressedSize = Input.getBuffer().size();
+
+  SmallVector<char, 0> FinalBuffer;
+  llvm::raw_svector_ostream OS(FinalBuffer);
+  OS << MagicNumber;
+  OS.write(reinterpret_cast<const char *>(&Version), sizeof(Version));
+  OS.write(reinterpret_cast<const char *>(&CompressionMethod),
+           sizeof(CompressionMethod));
+  OS.write(reinterpret_cast<const char *>(&UncompressedSize),
+           sizeof(UncompressedSize));
+  OS.write(reinterpret_cast<const char *>(&TruncatedHash),
+           sizeof(TruncatedHash));
+  OS.write(reinterpret_cast<const char *>(CompressedBuffer.data()),
+           CompressedBuffer.size());
+
+  if (Verbose) {
+    auto MethodUsed =
+        CompressionFormat == llvm::compression::Format::Zstd ? "zstd" : "zlib";
+    llvm::errs() << "Compressed bundle format version: " << Version << "\n"
+                 << "Compression method used: " << MethodUsed << "\n"
+                 << "Binary size before compression: " << UncompressedSize
+                 << " bytes\n"
+                 << "Binary size after compression: " << CompressedBuffer.size()
+                 << " bytes\n"
+                 << "Truncated MD5 hash: "
+                 << llvm::format_hex(TruncatedHash, 16) << "\n";
+  }
+
+  return llvm::MemoryBuffer::getMemBufferCopy(
+      llvm::StringRef(FinalBuffer.data(), FinalBuffer.size()));
+}
+
+llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>
+CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input,
+                                    bool Verbose) {
+
+  StringRef Blob = Input.getBuffer();
+
+  if (Blob.size() < HeaderSize) {
+    return llvm::MemoryBuffer::getMemBufferCopy(Blob);
+  }
+  if (llvm::identify_magic(Blob) !=
+      llvm::file_magic::offload_bundle_compressed) {
+    if (Verbose)
+      llvm::errs() << "Uncompressed bundle.\n";
+    return llvm::MemoryBuffer::getMemBufferCopy(Blob);
+  }
+
+  uint16_t ThisVersion;
+  uint16_t CompressionMethod;
+  uint32_t UncompressedSize;
+  uint64_t StoredHash;
+  memcpy(&ThisVersion, Input.getBuffer().data() + MagicNumber.size(),
+         sizeof(uint16_t));
+  memcpy(&CompressionMethod, Blob.data() + MagicSize + VersionFieldSize,
+         sizeof(uint16_t));
+  memcpy(&UncompressedSize,
+         Blob.data() + MagicSize + VersionFieldSize + MethodFieldSize,
+         sizeof(uint32_t));
+  memcpy(&StoredHash,
+         Blob.data() + MagicSize + VersionFieldSize + MethodFieldSize +
+             SizeFieldSize,
+         sizeof(uint64_t));
+
+  llvm::compression::Format CompressionFormat;
+  if (CompressionMethod ==
+      static_cast<uint16_t>(llvm::compression::Format::Zlib))
+    CompressionFormat = llvm::compression::Format::Zlib;
+  else if (CompressionMethod ==
+           static_cast<uint16_t>(llvm::compression::Format::Zstd))
+    CompressionFormat = llvm::compression::Format::Zstd;
+  else
+    return createStringError(inconvertibleErrorCode(),
+                             "Unknown compressing method");
+
+  llvm::Timer DecompressTimer("Decompression Timer", "Decompression time",
+                              ClangOffloadBundlerTimerGroup);
+  if (Verbose)
+    DecompressTimer.startTimer();
+
+  SmallVector<uint8_t, 0> DecompressedData;
+  StringRef CompressedData = Blob.substr(HeaderSize);
+  if (llvm::Error DecompressionError = llvm::compression::decompress(
+          CompressionFormat, llvm::arrayRefFromStringRef(CompressedData),
+          DecompressedData, UncompressedSize))
+    return createStringError(inconvertibleErrorCode(),
+                             "Could not decompress embedded file contents: " +
+                                 llvm::toString(std::move(DecompressionError)));
+
+  if (Verbose) {
+    DecompressTimer.stopTimer();
+
+    // Recalculate MD5 hash
+    llvm::Timer HashRecalcTimer("Hash Recalculation Timer",
+                                "Hash recalculation time",
+                                ClangOffloadBundlerTimerGroup);
+    HashRecalcTimer.startTimer();
+    llvm::MD5 Hash;
+    llvm::MD5::MD5Result Result;
+    Hash.update(llvm::ArrayRef<uint8_t>(DecompressedData.data(),
+                                        DecompressedData.size()));
+    Hash.final(Result);
+    uint64_t RecalculatedHash = Result.low();
+    HashRecalcTimer.stopTimer();
+    bool HashMatch = (StoredHash == RecalculatedHash);
+
+    llvm::errs() << "Compressed bundle format version: " << ThisVersion << "\n"
+                 << "Decompression method: "
+                 << (CompressionFormat == llvm::compression::Format::Zlib
+                         ? "zlib"
+                         : "zstd")
+                 << "\n"
+                 << "Size before decompression: " << CompressedData.size()
+                 << " bytes\n"
+                 << "Size after decompression: " << UncompressedSize
+                 << " bytes\n"
+                 << "Stored hash: " << llvm::format_hex(StoredHash, 16) << "\n"
+                 << "Recalculated hash: "
+                 << llvm::format_hex(RecalculatedHash, 16) << "\n"
+                 << "Hashes match: " << (HashMatch ? "Yes" : "No") << "\n";
+  }
+
+  return llvm::MemoryBuffer::getMemBufferCopy(
+      llvm::toStringRef(DecompressedData));
+}
+
 // List bundle IDs. Return true if an error was found.
 Error OffloadBundler::ListBundleIDsInFile(
     StringRef InputFileName, const OffloadBundlerConfig &BundlerConfig) {
@@ -890,28 +1090,35 @@ Error OffloadBundler::ListBundleIDsInFile(
   if (std::error_code EC = CodeOrErr.getError())
     return createFileError(InputFileName, EC);
 
-  MemoryBuffer &Input = **CodeOrErr;
+  // Decompress the input if necessary.
+  Expected<std::unique_ptr<MemoryBuffer>> DecompressedBufferOrErr =
+      CompressedOffloadBundle::decompress(**CodeOrErr, BundlerConfig.Verbose);
+  if (!DecompressedBufferOrErr)
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to decompress input: " +
+            llvm::toString(DecompressedBufferOrErr.takeError()));
+
+  MemoryBuffer &DecompressedInput = **DecompressedBufferOrErr;
 
   // Select the right files handler.
   Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr =
-      CreateFileHandler(Input, BundlerConfig);
+      CreateFileHandler(DecompressedInput, BundlerConfig);
   if (!FileHandlerOrErr)
     return FileHandlerOrErr.takeError();
 
   std::unique_ptr<FileHandler> &FH = *FileHandlerOrErr;
   assert(FH);
-  return FH->listBundleIDs(Input);
+  return FH->listBundleIDs(DecompressedInput);
 }
 
 /// Bundle the files. Return true if an error was found.
 Error OffloadBundler::BundleFiles() {
   std::error_code EC;
 
-  // Create output file.
-  raw_fd_ostream OutputFile(BundlerConfig.OutputFileNames.front(), EC,
-                            sys::fs::OF_None);
-  if (EC)
-    return createFileError(BundlerConfig.OutputFileNames.front(), EC);
+  // Create a buffer to hold the content before compressing.
+  SmallVector<char, 0> Buffer;
+  llvm::raw_svector_ostream BufferStream(Buffer);
 
   // Open input files.
   SmallVector<std::unique_ptr<MemoryBuffer>, 8u> InputBuffers;
@@ -938,22 +1145,46 @@ Error OffloadBundler::BundleFiles() {
   assert(FH);
 
   // Write header.
-  if (Error Err = FH->WriteHeader(OutputFile, InputBuffers))
+  if (Error Err = FH->WriteHeader(BufferStream, InputBuffers))
     return Err;
 
   // Write all bundles along with the start/end markers. If an error was found
   // writing the end of the bundle component, abort the bundle writing.
   auto Input = InputBuffers.begin();
   for (auto &Triple : BundlerConfig.TargetNames) {
-    if (Error Err = FH->WriteBundleStart(OutputFile, Triple))
+    if (Error Err = FH->WriteBundleStart(BufferStream, Triple))
       return Err;
-    if (Error Err = FH->WriteBundle(OutputFile, **Input))
+    if (Error Err = FH->WriteBundle(BufferStream, **Input))
       return Err;
-    if (Error Err = FH->WriteBundleEnd(OutputFile, Triple))
+    if (Error Err = FH->WriteBundleEnd(BufferStream, Triple))
       return Err;
     ++Input;
   }
-  return Error::success();
+
+  raw_fd_ostream OutputFile(BundlerConfig.OutputFileNames.front(), EC,
+                            sys::fs::OF_None);
+  if (EC)
+    return createFileError(BundlerConfig.OutputFileNames.front(), EC);
+
+  SmallVector<char, 0> CompressedBuffer;
+  if (BundlerConfig.Compress) {
+    std::unique_ptr<llvm::MemoryBuffer> BufferMemory =
+        llvm::MemoryBuffer::getMemBufferCopy(
+            llvm::StringRef(Buffer.data(), Buffer.size()));
+    auto CompressionResult =
+        CompressedOffloadBundle::compress(*BufferMemory, BundlerConfig.Verbose);
+    if (auto Error = CompressionResult.takeError())
+      return Error;
+
+    auto CompressedMemBuffer = std::move(CompressionResult.get());
+    CompressedBuffer.assign(CompressedMemBuffer->getBufferStart(),
+                            CompressedMemBuffer->getBufferEnd());
+  } else
+    CompressedBuffer = Buffer;
+
+  OutputFile.write(CompressedBuffer.data(), CompressedBuffer.size());
+
+  return FH->finalizeOutputFile();
 }
 
 // Unbundle the files. Return true if an error was found.
@@ -964,7 +1195,16 @@ Error OffloadBundler::UnbundleFiles() {
   if (std::error_code EC = CodeOrErr.getError())
     return createFileError(BundlerConfig.InputFileNames.front(), EC);
 
-  MemoryBuffer &Input = **CodeOrErr;
+  // Decompress the input if necessary.
+  Expected<std::unique_ptr<MemoryBuffer>> DecompressedBufferOrErr =
+      CompressedOffloadBundle::decompress(**CodeOrErr, BundlerConfig.Verbose);
+  if (!DecompressedBufferOrErr)
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Failed to decompress input: " +
+            llvm::toString(DecompressedBufferOrErr.takeError()));
+
+  MemoryBuffer &Input = **DecompressedBufferOrErr;
 
   // Select the right files handler.
   Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr =
@@ -1169,11 +1409,23 @@ Error OffloadBundler::UnbundleArchive() {
     if (!CodeObjectBufferRefOrErr)
       return CodeObjectBufferRefOrErr.takeError();
 
-    auto CodeObjectBuffer =
+    auto TempCodeObjectBuffer =
         MemoryBuffer::getMemBuffer(*CodeObjectBufferRefOrErr, false);
 
+    // Decompress the buffer if necessary.
+    Expected<std::unique_ptr<MemoryBuffer>> DecompressedBufferOrErr =
+        CompressedOffloadBundle::decompress(*TempCodeObjectBuffer,
+                                            BundlerConfig.Verbose);
+    if (!DecompressedBufferOrErr)
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Failed to decompress code object: " +
+              llvm::toString(DecompressedBufferOrErr.takeError()));
+
+    MemoryBuffer &CodeObjectBuffer = **DecompressedBufferOrErr;
+
     Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr =
-        CreateFileHandler(*CodeObjectBuffer, BundlerConfig);
+        CreateFileHandler(CodeObjectBuffer, BundlerConfig);
     if (!FileHandlerOrErr)
       return FileHandlerOrErr.takeError();
 
@@ -1181,11 +1433,11 @@ Error OffloadBundler::UnbundleArchive() {
     assert(FileHandler &&
            "FileHandle creation failed for file in the archive!");
 
-    if (Error ReadErr = FileHandler->ReadHeader(*CodeObjectBuffer))
+    if (Error ReadErr = FileHandler->ReadHeader(CodeObjectBuffer))
       return ReadErr;
 
     Expected<std::optional<StringRef>> CurBundleIDOrErr =
-        FileHandler->ReadBundleStart(*CodeObjectBuffer);
+        FileHandler->ReadBundleStart(CodeObjectBuffer);
     if (!CurBundleIDOrErr)
       return CurBundleIDOrErr.takeError();
 
@@ -1206,7 +1458,7 @@ Error OffloadBundler::UnbundleArchive() {
                                              BundlerConfig)) {
         std::string BundleData;
         raw_string_ostream DataStream(BundleData);
-        if (Error Err = FileHandler->ReadBundle(DataStream, *CodeObjectBuffer))
+        if (Error Err = FileHandler->ReadBundle(DataStream, CodeObjectBuffer))
           return Err;
 
         for (auto &CompatibleTarget : CompatibleTargets) {
@@ -1244,11 +1496,11 @@ Error OffloadBundler::UnbundleArchive() {
         }
       }
 
-      if (Error Err = FileHandler->ReadBundleEnd(*CodeObjectBuffer))
+      if (Error Err = FileHandler->ReadBundleEnd(CodeObjectBuffer))
         return Err;
 
       Expected<std::optional<StringRef>> NextTripleOrErr =
-          FileHandler->ReadBundleStart(*CodeObjectBuffer);
+          FileHandler->ReadBundleStart(CodeObjectBuffer);
       if (!NextTripleOrErr)
         return NextTripleOrErr.takeError();
 
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 3f08c0ef5d6f00..d4b33ad551c433 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -329,6 +329,20 @@ RocmInstallationDetector::RocmInstallationDetector(
   RocmDeviceLibPathArg =
       Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ);
   HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ);
+  HIPStdParPathArg =
+    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ);
+  HasHIPStdParLibrary =
+    !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg +
+                                                   "/hipstdpar_lib.hpp");
+  HIPRocThrustPathArg =
+    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ);
+  HasRocThrustLibrary = !HIPRocThrustPathArg.empty() &&
+                        D.getVFS().exists(HIPRocThrustPathArg + "/thrust");
+  HIPRocPrimPathArg =
+    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ);
+  HasRocPrimLibrary = !HIPRocPrimPathArg.empty() &&
+                      D.getVFS().exists(HIPRocPrimPathArg + "/rocprim");
+
   if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) {
     HIPVersionArg = A->getValue();
     unsigned Major = ~0U;
@@ -507,6 +521,7 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                                  ArgStringList &CC1Args) const {
   bool UsesRuntimeWrapper = VersionMajorMinor > llvm::VersionTuple(3, 5) &&
                             !DriverArgs.hasArg(options::OPT_nohipwrapperinc);
+  bool HasHipStdPar = DriverArgs.hasArg(options::OPT_hipstdpar);
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
     // HIP header includes standard library wrapper headers under clang
@@ -529,8 +544,45 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
     CC1Args.push_back(DriverArgs.MakeArgString(P));
   }
 
-  if (DriverArgs.hasArg(options::OPT_nogpuinc))
+  const auto HandleHipStdPar = [=, &DriverArgs, &CC1Args]() {
+    if (!hasHIPStdParLibrary()) {
+      D.Diag(diag::err_drv_no_hipstdpar_lib);
+      return;
+    }
+    if (!HasRocThrustLibrary &&
+        !D.getVFS().exists(getIncludePath() + "/thrust")) {
+      D.Diag(diag::err_drv_no_hipstdpar_thrust_lib);
+      return;
+    }
+    if (!HasRocPrimLibrary &&
+        !D.getVFS().exists(getIncludePath() + "/rocprim")) {
+      D.Diag(diag::err_drv_no_hipstdpar_prim_lib);
+      return;
+    }
+
+    const char *ThrustPath;
+    if (HasRocThrustLibrary)
+      ThrustPath = DriverArgs.MakeArgString(HIPRocThrustPathArg);
+    else
+      ThrustPath = DriverArgs.MakeArgString(getIncludePath() + "/thrust");
+
+    const char *PrimPath;
+    if (HasRocPrimLibrary)
+      PrimPath = DriverArgs.MakeArgString(HIPRocPrimPathArg);
+    else
+      PrimPath = DriverArgs.MakeArgString(getIncludePath() + "/rocprim");
+
+    CC1Args.append({"-idirafter", ThrustPath, "-idirafter", PrimPath,
+                    "-idirafter", DriverArgs.MakeArgString(HIPStdParPathArg),
+                    "-include", "hipstdpar_lib.hpp"});
+  };
+
+  if (DriverArgs.hasArg(options::OPT_nogpuinc)) {
+    if (HasHipStdPar)
+      HandleHipStdPar();
+
     return;
+  }
 
   if (!hasHIPRuntime()) {
     D.Diag(diag::err_drv_no_hip_runtime);
@@ -541,6 +593,8 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
   CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
   if (UsesRuntimeWrapper)
     CC1Args.append({"-include", "__clang_hip_runtime_wrapper.h"});
+  if (HasHipStdPar)
+    HandleHipStdPar();
 }
 
 void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a694d00b569a59..c74f6ff447261d 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5150,9 +5150,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   const bool IsAssertBuild = true;
 #endif
 
-  // Disable the verification pass in -asserts builds.
-  if (!IsAssertBuild)
+  // Disable the verification pass in asserts builds unless otherwise specified.
+  if (Args.hasFlag(options::OPT_fno_verify_intermediate_code,
+                   options::OPT_fverify_intermediate_code, !IsAssertBuild)) {
     CmdArgs.push_back("-disable-llvm-verifier");
+  }
 
   // Discard value names in assert builds unless otherwise specified.
   if (Args.hasFlag(options::OPT_fdiscard_value_names,
@@ -6580,6 +6582,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-fhip-new-launch-api");
     Args.addOptInFlag(CmdArgs, options::OPT_fgpu_allow_device_init,
                       options::OPT_fno_gpu_allow_device_init);
+    Args.AddLastArg(CmdArgs, options::OPT_hipstdpar);
+    Args.AddLastArg(CmdArgs, options::OPT_hipstdpar_interpose_alloc);
     Args.addOptInFlag(CmdArgs, options::OPT_fhip_kernel_arg_name,
                       options::OPT_fno_hip_kernel_arg_name);
   }
@@ -8468,6 +8472,11 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
     }
     CmdArgs.push_back(TCArgs.MakeArgString(UB));
   }
+  if (TCArgs.hasFlag(options::OPT_offload_compress,
+                     options::OPT_no_offload_compress, false))
+    CmdArgs.push_back("-compress");
+  if (TCArgs.hasArg(options::OPT_v))
+    CmdArgs.push_back("-verbose");
   // All the inputs are encoded as commands.
   C.addCommand(std::make_unique<Command>(
       JA, *this, ResponseFileSupport::None(),
@@ -8551,6 +8560,8 @@ void OffloadBundler::ConstructJobMultipleOutputs(
   }
   CmdArgs.push_back("-unbundle");
   CmdArgs.push_back("-allow-missing-bundles");
+  if (TCArgs.hasArg(options::OPT_v))
+    CmdArgs.push_back("-verbose");
 
   // All the inputs are encoded as commands.
   C.addCommand(std::make_unique<Command>(
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1ba031d42741cc..25fd940584624e 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -631,6 +631,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
 
   const char *PluginOptPrefix = IsOSAIX ? "-bplugin_opt:" : "-plugin-opt=";
   const char *ExtraDash = IsOSAIX ? "-" : "";
+  const char *ParallelismOpt = IsOSAIX ? "-threads=" : "jobs=";
 
   // Note, this solution is far from perfect, better to encode it into IR
   // metadata, but this may not be worth it, since it looks like aranges is on
@@ -690,8 +691,8 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
 
   StringRef Parallelism = getLTOParallelism(Args, D);
   if (!Parallelism.empty())
-    CmdArgs.push_back(
-        Args.MakeArgString(Twine(PluginOptPrefix) + "jobs=" + Parallelism));
+    CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) +
+                                         ParallelismOpt + Parallelism));
 
   // If an explicit debugger tuning argument appeared, pass it along.
   if (Arg *A =
@@ -969,41 +970,29 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC,
                                   llvm::opt::ArgStringList &CmdArgs) {
   if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
     CmdArgs.push_back("Fortran_main.lib");
-    CmdArgs.push_back("flang-rt.lib");
+    CmdArgs.push_back("FortranRuntime.lib");
+    CmdArgs.push_back("FortranDecimal.lib");
   } else {
     CmdArgs.push_back("-lFortran_main");
-    CmdArgs.push_back("-lflang-rt");
+    CmdArgs.push_back("-lFortranRuntime");
+    CmdArgs.push_back("-lFortranDecimal");
   }
 }
 
 void tools::addFortranRuntimeLibraryPath(const ToolChain &TC,
                                          const llvm::opt::ArgList &Args,
                                          ArgStringList &CmdArgs) {
-  // Default to the <driver-path>/../lib, <driver-path>/../flang-rt/lib, and
-  // <driver-path>/../runtimes/runtimes-bins/flang-rt/lib directories. This
-  // works fine on the platforms that we have tested so far. We will probably
-  // have to re-fine this in the future. In particular, on some platforms, we
-  // may need to use lib64 instead of lib.
-  SmallString<256> BuildLibPath =
-      llvm::sys::path::parent_path(TC.getDriver().Dir);
-  SmallString<256> FlangRTLibPath =
-      llvm::sys::path::parent_path(TC.getDriver().Dir);
-  SmallString<256> RuntimesLibPath =
+  // Default to the <driver-path>/../lib directory. This works fine on the
+  // platforms that we have tested so far. We will probably have to re-fine
+  // this in the future. In particular, on some platforms, we may need to use
+  // lib64 instead of lib.
+  SmallString<256> DefaultLibPath =
       llvm::sys::path::parent_path(TC.getDriver().Dir);
-  // Search path for Fortran_main and Flang-rt libraries.
-  llvm::sys::path::append(BuildLibPath, "lib");
-  llvm::sys::path::append(FlangRTLibPath, "flang-rt/lib");
-  llvm::sys::path::append(RuntimesLibPath,
-                          "runtimes/runtimes-bins/flang-rt/lib");
-  if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
-    CmdArgs.push_back(Args.MakeArgString("-libpath:" + BuildLibPath));
-    CmdArgs.push_back(Args.MakeArgString("-libpath:" + FlangRTLibPath));
-    CmdArgs.push_back(Args.MakeArgString("-libpath:" + RuntimesLibPath));
-  } else {
-    CmdArgs.push_back(Args.MakeArgString("-L" + BuildLibPath));
-    CmdArgs.push_back(Args.MakeArgString("-L" + FlangRTLibPath));
-    CmdArgs.push_back(Args.MakeArgString("-L" + RuntimesLibPath));
-  }
+  llvm::sys::path::append(DefaultLibPath, "lib");
+  if (TC.getTriple().isKnownWindowsMSVCEnvironment())
+    CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath));
+  else
+    CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath));
 }
 
 static void addSanitizerRuntime(const ToolChain &TC, const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 10f785ce7ab907..fe44939741d8dd 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "Flang.h"
 #include "CommonArgs.h"
 
@@ -170,6 +169,38 @@ void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const {
   }
 }
 
+void Flang::AddAArch64TargetArgs(const ArgList &Args,
+                                 ArgStringList &CmdArgs) const {
+  // Handle -msve_vector_bits=<bits>
+  if (Arg *A = Args.getLastArg(options::OPT_msve_vector_bits_EQ)) {
+    StringRef Val = A->getValue();
+    const Driver &D = getToolChain().getDriver();
+    if (Val.equals("128") || Val.equals("256") || Val.equals("512") ||
+        Val.equals("1024") || Val.equals("2048") || Val.equals("128+") ||
+        Val.equals("256+") || Val.equals("512+") || Val.equals("1024+") ||
+        Val.equals("2048+")) {
+      unsigned Bits = 0;
+      if (Val.endswith("+"))
+        Val = Val.substr(0, Val.size() - 1);
+      else {
+        [[maybe_unused]] bool Invalid = Val.getAsInteger(10, Bits);
+        assert(!Invalid && "Failed to parse value");
+        CmdArgs.push_back(
+            Args.MakeArgString("-mvscale-max=" + llvm::Twine(Bits / 128)));
+      }
+
+      [[maybe_unused]] bool Invalid = Val.getAsInteger(10, Bits);
+      assert(!Invalid && "Failed to parse value");
+      CmdArgs.push_back(
+          Args.MakeArgString("-mvscale-min=" + llvm::Twine(Bits / 128)));
+      // Silently drop requests for vector-length agnostic code as it's implied.
+    } else if (!Val.equals("scalable"))
+      // Handle the unsupported values passed to msve-vector-bits.
+      D.Diag(diag::err_drv_unsupported_option_argument)
+          << A->getSpelling() << Val;
+  }
+}
+
 void Flang::addTargetOptions(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const ToolChain &TC = getToolChain();
@@ -186,9 +217,13 @@ void Flang::addTargetOptions(const ArgList &Args,
   switch (TC.getArch()) {
   default:
     break;
+  case llvm::Triple::aarch64:
+    getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
+    AddAArch64TargetArgs(Args, CmdArgs);
+    break;
+
   case llvm::Triple::r600:
   case llvm::Triple::amdgcn:
-  case llvm::Triple::aarch64:
   case llvm::Triple::riscv64:
   case llvm::Triple::x86_64:
     getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
diff --git a/clang/lib/Driver/ToolChains/Flang.h b/clang/lib/Driver/ToolChains/Flang.h
index 962b4ae601726a..0141240b5d3ac9 100644
--- a/clang/lib/Driver/ToolChains/Flang.h
+++ b/clang/lib/Driver/ToolChains/Flang.h
@@ -56,6 +56,13 @@ class LLVM_LIBRARY_VISIBILITY Flang : public Tool {
   void addTargetOptions(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const;
 
+  /// Add specific options for AArch64 target.
+  ///
+  /// \param [in] Args The list of input driver arguments
+  /// \param [out] CmdArgs The list of output command arguments
+  void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
+                            llvm::opt::ArgStringList &CmdArgs) const;
+
   /// Extract offload options from the driver arguments and add them to
   /// the command arguments.
   /// \param [in] C The current compilation for the driver invocation
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 3fc5669c06c399..ccb36a6c846c80 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -113,6 +113,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
                         "--no-undefined",
                         "-shared",
                         "-plugin-opt=-amdgpu-internalize-symbols"};
+  if (Args.hasArg(options::OPT_hipstdpar))
+    LldArgs.push_back("-plugin-opt=-amdgpu-enable-hipstdpar");
 
   auto &TC = getToolChain();
   auto &D = TC.getDriver();
@@ -242,6 +244,8 @@ void HIPAMDToolChain::addClangTargetOptions(
   if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                           false))
     CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
+  if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
+    CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
 
   StringRef MaxThreadsPerBlock =
       DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index 8b9d8db90ffab7..04efdcba20ea74 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -84,6 +84,12 @@ void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
       Args.MakeArgString(std::string("-output=").append(Output));
   BundlerArgs.push_back(BundlerOutputArg);
 
+  if (Args.hasFlag(options::OPT_offload_compress,
+                   options::OPT_no_offload_compress, false))
+    BundlerArgs.push_back("-compress");
+  if (Args.hasArg(options::OPT_v))
+    BundlerArgs.push_back("-verbose");
+
   const char *Bundler = Args.MakeArgString(
       T.getToolChain().GetProgramPath("clang-offload-bundler"));
   C.addCommand(std::make_unique<Command>(
diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
index 5af7ea985a28a1..5872e13bda358f 100644
--- a/clang/lib/Driver/ToolChains/MinGW.cpp
+++ b/clang/lib/Driver/ToolChains/MinGW.cpp
@@ -709,8 +709,11 @@ void toolchains::MinGW::addClangTargetOptions(
     }
   }
 
-  if (Arg *A = DriverArgs.getLastArgNoClaim(options::OPT_mthreads))
-    A->ignoreTargetSpecific();
+  for (auto Opt : {options::OPT_mthreads, options::OPT_mwindows,
+                   options::OPT_mconsole, options::OPT_mdll}) {
+    if (Arg *A = DriverArgs.getLastArgNoClaim(Opt))
+      A->ignoreTargetSpecific();
+  }
 }
 
 void toolchains::MinGW::AddClangCXXStdlibIncludeArgs(
diff --git a/clang/lib/Driver/ToolChains/ROCm.h b/clang/lib/Driver/ToolChains/ROCm.h
index 554d8a6929ac59..dceb0ab0366933 100644
--- a/clang/lib/Driver/ToolChains/ROCm.h
+++ b/clang/lib/Driver/ToolChains/ROCm.h
@@ -77,6 +77,9 @@ class RocmInstallationDetector {
   const Driver &D;
   bool HasHIPRuntime = false;
   bool HasDeviceLibrary = false;
+  bool HasHIPStdParLibrary = false;
+  bool HasRocThrustLibrary = false;
+  bool HasRocPrimLibrary = false;
 
   // Default version if not detected or specified.
   const unsigned DefaultVersionMajor = 3;
@@ -96,6 +99,13 @@ class RocmInstallationDetector {
   std::vector<std::string> RocmDeviceLibPathArg;
   // HIP runtime path specified by --hip-path.
   StringRef HIPPathArg;
+  // HIP Standard Parallel Algorithm acceleration library specified by
+  // --hipstdpar-path
+  StringRef HIPStdParPathArg;
+  // rocThrust algorithm library specified by --hipstdpar-thrust-path
+  StringRef HIPRocThrustPathArg;
+  // rocPrim algorithm library specified by --hipstdpar-prim-path
+  StringRef HIPRocPrimPathArg;
   // HIP version specified by --hip-version.
   StringRef HIPVersionArg;
   // Wheter -nogpulib is specified.
@@ -180,6 +190,9 @@ class RocmInstallationDetector {
   /// Check whether we detected a valid ROCm device library.
   bool hasDeviceLibrary() const { return HasDeviceLibrary; }
 
+  /// Check whether we detected a valid HIP STDPAR Acceleration library.
+  bool hasHIPStdParLibrary() const { return HasHIPStdParLibrary; }
+
   /// Print information about the detected ROCm installation.
   void print(raw_ostream &OS) const;
 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 8a34b4d9431b62..7b0ebe2cf621b1 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -3914,7 +3914,7 @@ llvm::Expected<FormatStyle> getStyle(StringRef StyleName, StringRef FileName,
 
   FormatStyle FallbackStyle = getNoStyle();
   if (!getPredefinedStyle(FallbackStyleName, Style.Language, &FallbackStyle))
-    return make_string_error("Invalid fallback style \"" + FallbackStyleName);
+    return make_string_error("Invalid fallback style: " + FallbackStyleName);
 
   llvm::SmallVector<std::unique_ptr<llvm::MemoryBuffer>, 1>
       ChildFormatTextToApply;
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index dbd3a6e70f037e..5877b0a6124742 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -61,6 +61,7 @@ namespace format {
   TYPE(CSharpStringLiteral)                                                    \
   TYPE(CtorInitializerColon)                                                   \
   TYPE(CtorInitializerComma)                                                   \
+  TYPE(CtorDtorDeclName)                                                       \
   TYPE(DesignatedInitializerLSquare)                                           \
   TYPE(DesignatedInitializerPeriod)                                            \
   TYPE(DictLiteral)                                                            \
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index ae2cbbdce93461..6f879006465ecf 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2016,8 +2016,7 @@ class AnnotatingParser {
                Style.Language == FormatStyle::LK_Java) {
       Current.setType(TT_LambdaArrow);
     } else if (Current.is(tok::arrow) && AutoFound &&
-               (Line.MightBeFunctionDecl || Line.InPPDirective) &&
-               Current.NestingLevel == 0 &&
+               Line.MightBeFunctionDecl && Current.NestingLevel == 0 &&
                !Current.Previous->isOneOf(tok::kw_operator, tok::identifier)) {
       // not auto operator->() -> xxx;
       Current.setType(TT_TrailingReturnArrow);
@@ -3211,12 +3210,12 @@ static bool isCtorOrDtorName(const FormatToken *Tok) {
 }
 
 void TokenAnnotator::annotate(AnnotatedLine &Line) {
-  for (auto &Child : Line.Children)
-    annotate(*Child);
-
   AnnotatingParser Parser(Style, Line, Keywords, Scopes);
   Line.Type = Parser.parseLine();
 
+  for (auto &Child : Line.Children)
+    annotate(*Child);
+
   // With very deep nesting, ExpressionParser uses lots of stack and the
   // formatting algorithm is very slow. We're not going to do a good job here
   // anyway - it's probably generated code being formatted by mistake.
@@ -3234,7 +3233,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
     auto *Tok = getFunctionName(Line);
     if (Tok && ((!Scopes.empty() && Scopes.back() == ST_Class) ||
                 Line.endsWith(TT_FunctionLBrace) || isCtorOrDtorName(Tok))) {
-      Tok->setFinalizedType(TT_FunctionDeclarationName);
+      Tok->setFinalizedType(TT_CtorDtorDeclName);
     }
   }
 
@@ -3252,7 +3251,8 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
 // This function heuristically determines whether 'Current' starts the name of a
 // function declaration.
 static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
-                                      const AnnotatedLine &Line) {
+                                      const AnnotatedLine &Line,
+                                      FormatToken *&ClosingParen) {
   assert(Current.Previous);
 
   if (Current.is(TT_FunctionDeclarationName))
@@ -3344,16 +3344,16 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
   // Check whether parameter list can belong to a function declaration.
   if (!Next || Next->isNot(tok::l_paren) || !Next->MatchingParen)
     return false;
+  ClosingParen = Next->MatchingParen;
+  assert(ClosingParen->is(tok::r_paren));
   // If the lines ends with "{", this is likely a function definition.
   if (Line.Last->is(tok::l_brace))
     return true;
-  if (Next->Next == Next->MatchingParen)
+  if (Next->Next == ClosingParen)
     return true; // Empty parentheses.
   // If there is an &/&& after the r_paren, this is likely a function.
-  if (Next->MatchingParen->Next &&
-      Next->MatchingParen->Next->is(TT_PointerOrReference)) {
+  if (ClosingParen->Next && ClosingParen->Next->is(TT_PointerOrReference))
     return true;
-  }
 
   // Check for K&R C function definitions (and C++ function definitions with
   // unnamed parameters), e.g.:
@@ -3370,7 +3370,7 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
     return true;
   }
 
-  for (const FormatToken *Tok = Next->Next; Tok && Tok != Next->MatchingParen;
+  for (const FormatToken *Tok = Next->Next; Tok && Tok != ClosingParen;
        Tok = Tok->Next) {
     if (Tok->is(TT_TypeDeclarationParen))
       return true;
@@ -3442,13 +3442,18 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const {
     calculateArrayInitializerColumnList(Line);
 
   bool LineIsFunctionDeclaration = false;
+  FormatToken *ClosingParen = nullptr;
   for (FormatToken *Tok = Current, *AfterLastAttribute = nullptr; Tok;
        Tok = Tok->Next) {
     if (Tok->Previous->EndsCppAttributeGroup)
       AfterLastAttribute = Tok;
-    if (isFunctionDeclarationName(Style.isCpp(), *Tok, Line)) {
-      LineIsFunctionDeclaration = true;
-      Tok->setFinalizedType(TT_FunctionDeclarationName);
+    if (const bool IsCtorOrDtor = Tok->is(TT_CtorDtorDeclName);
+        IsCtorOrDtor ||
+        isFunctionDeclarationName(Style.isCpp(), *Tok, Line, ClosingParen)) {
+      if (!IsCtorOrDtor) {
+        LineIsFunctionDeclaration = true;
+        Tok->setFinalizedType(TT_FunctionDeclarationName);
+      }
       if (AfterLastAttribute &&
           mustBreakAfterAttributes(*AfterLastAttribute, Style)) {
         AfterLastAttribute->MustBreakBefore = true;
@@ -3458,29 +3463,38 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const {
     }
   }
 
-  if (Style.isCpp() && !LineIsFunctionDeclaration) {
-    // Annotate */&/&& in `operator` function calls as binary operators.
-    for (const auto *Tok = Line.First; Tok; Tok = Tok->Next) {
-      if (Tok->isNot(tok::kw_operator))
-        continue;
-      do {
-        Tok = Tok->Next;
-      } while (Tok && Tok->isNot(TT_OverloadedOperatorLParen));
-      if (!Tok)
-        break;
-      const auto *LeftParen = Tok;
-      for (Tok = Tok->Next; Tok && Tok != LeftParen->MatchingParen;
-           Tok = Tok->Next) {
-        if (Tok->isNot(tok::identifier))
-          continue;
-        auto *Next = Tok->Next;
-        const bool NextIsBinaryOperator =
-            Next && Next->isOneOf(tok::star, tok::amp, tok::ampamp) &&
-            Next->Next && Next->Next->is(tok::identifier);
-        if (!NextIsBinaryOperator)
+  if (Style.isCpp()) {
+    if (!LineIsFunctionDeclaration) {
+      // Annotate */&/&& in `operator` function calls as binary operators.
+      for (const auto *Tok = Line.First; Tok; Tok = Tok->Next) {
+        if (Tok->isNot(tok::kw_operator))
           continue;
-        Next->setType(TT_BinaryOperator);
-        Tok = Next;
+        do {
+          Tok = Tok->Next;
+        } while (Tok && Tok->isNot(TT_OverloadedOperatorLParen));
+        if (!Tok)
+          break;
+        const auto *LeftParen = Tok;
+        for (Tok = Tok->Next; Tok && Tok != LeftParen->MatchingParen;
+             Tok = Tok->Next) {
+          if (Tok->isNot(tok::identifier))
+            continue;
+          auto *Next = Tok->Next;
+          const bool NextIsBinaryOperator =
+              Next && Next->isOneOf(tok::star, tok::amp, tok::ampamp) &&
+              Next->Next && Next->Next->is(tok::identifier);
+          if (!NextIsBinaryOperator)
+            continue;
+          Next->setType(TT_BinaryOperator);
+          Tok = Next;
+        }
+      }
+    } else if (ClosingParen) {
+      for (auto *Tok = ClosingParen->Next; Tok; Tok = Tok->Next) {
+        if (Tok->is(tok::arrow)) {
+          Tok->setType(TT_TrailingReturnArrow);
+          break;
+        }
       }
     }
   }
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index bf38ed73c4a0b4..dda5fd077e590e 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -173,10 +173,12 @@ void UnwrappedLineParser::reset() {
   CommentsBeforeNextToken.clear();
   FormatTok = nullptr;
   MustBreakBeforeNextToken = false;
+  IsDecltypeAutoFunction = false;
   PreprocessorDirectives.clear();
   CurrentLines = &Lines;
   DeclarationScopeStack.clear();
   NestedTooDeep.clear();
+  NestedLambdas.clear();
   PPStack.clear();
   Line->FirstStartColumn = FirstStartColumn;
 
@@ -1766,6 +1768,17 @@ void UnwrappedLineParser::parseStructuralElement(
       if (parseStructLike())
         return;
       break;
+    case tok::kw_decltype:
+      nextToken();
+      if (FormatTok->is(tok::l_paren)) {
+        parseParens();
+        assert(FormatTok->Previous);
+        if (FormatTok->Previous->endsSequence(tok::r_paren, tok::kw_auto,
+                                              tok::l_paren)) {
+          Line->SeenDecltypeAuto = true;
+        }
+      }
+      break;
     case tok::period:
       nextToken();
       // In Java, classes have an implicit static member "class".
@@ -1827,6 +1840,7 @@ void UnwrappedLineParser::parseStructuralElement(
       if (InRequiresExpression)
         FormatTok->setFinalizedType(TT_BracedListLBrace);
       if (!tryToParsePropertyAccessor() && !tryToParseBracedList()) {
+        IsDecltypeAutoFunction = Line->SeenDecltypeAuto;
         // A block outside of parentheses must be the last part of a
         // structural element.
         // FIXME: Figure out cases where this is not true, and add projections
@@ -1844,6 +1858,7 @@ void UnwrappedLineParser::parseStructuralElement(
         }
         FormatTok->setFinalizedType(TT_FunctionLBrace);
         parseBlock();
+        IsDecltypeAutoFunction = false;
         addUnwrappedLine();
         return;
       }
@@ -2249,9 +2264,15 @@ bool UnwrappedLineParser::tryToParseLambda() {
       return true;
     }
   }
+
   FormatTok->setFinalizedType(TT_LambdaLBrace);
   LSquare.setFinalizedType(TT_LambdaLSquare);
+
+  NestedLambdas.push_back(Line->SeenDecltypeAuto);
   parseChildBlock();
+  assert(!NestedLambdas.empty());
+  NestedLambdas.pop_back();
+
   return true;
 }
 
@@ -2469,6 +2490,8 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
                PrevPrev->endsSequence(tok::kw_constexpr, tok::kw_if))));
         const bool ReturnParens =
             Style.RemoveParentheses == FormatStyle::RPS_ReturnStatement &&
+            ((NestedLambdas.empty() && !IsDecltypeAutoFunction) ||
+             (!NestedLambdas.empty() && !NestedLambdas.back())) &&
             Prev && Prev->isOneOf(tok::kw_return, tok::kw_co_return) && Next &&
             Next->is(tok::semi);
         if ((DoubleParens && !Blacklisted) || ReturnParens) {
@@ -4379,6 +4402,7 @@ void UnwrappedLineParser::addUnwrappedLine(LineLevel AdjustLevel) {
   Line->MatchingOpeningBlockLineIndex = UnwrappedLine::kInvalidIndex;
   Line->FirstStartColumn = 0;
   Line->IsContinuation = false;
+  Line->SeenDecltypeAuto = false;
 
   if (ClosesWhitesmithsBlock && AdjustLevel == LineLevel::Remove)
     --Line->Level;
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 4138baaabe2693..a4f150d1957126 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -61,6 +61,9 @@ struct UnwrappedLine {
 
   bool MustBeDeclaration = false;
 
+  /// Whether the parser has seen \c decltype(auto) in this line.
+  bool SeenDecltypeAuto = false;
+
   /// \c True if this line should be indented by ContinuationIndent in
   /// addition to the normal indention level.
   bool IsContinuation = false;
@@ -335,6 +338,14 @@ class UnwrappedLineParser {
   // statement contains more than some predefined number of nested statements).
   SmallVector<bool, 8> NestedTooDeep;
 
+  // Keeps a stack of the states of nested lambdas (true if the return type of
+  // the lambda is `decltype(auto)`).
+  SmallVector<bool, 4> NestedLambdas;
+
+  // Whether the parser is parsing the body of a function whose return type is
+  // `decltype(auto)`.
+  bool IsDecltypeAutoFunction = false;
+
   // Represents preprocessor branch type, so we can find matching
   // #if/#else/#endif directives.
   enum PPBranchKind {
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 762729d1c4166a..dc81060671c171 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -307,8 +307,9 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
   SmallVector<unsigned, 16> ScopeStack;
 
   for (unsigned i = Start; i != End; ++i) {
+    auto &CurrentChange = Changes[i];
     if (ScopeStack.size() != 0 &&
-        Changes[i].indentAndNestingLevel() <
+        CurrentChange.indentAndNestingLevel() <
             Changes[ScopeStack.back()].indentAndNestingLevel()) {
       ScopeStack.pop_back();
     }
@@ -320,18 +321,18 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
            Changes[PreviousNonComment].Tok->is(tok::comment)) {
       --PreviousNonComment;
     }
-    if (i != Start && Changes[i].indentAndNestingLevel() >
+    if (i != Start && CurrentChange.indentAndNestingLevel() >
                           Changes[PreviousNonComment].indentAndNestingLevel()) {
       ScopeStack.push_back(i);
     }
 
     bool InsideNestedScope = ScopeStack.size() != 0;
     bool ContinuedStringLiteral = i > Start &&
-                                  Changes[i].Tok->is(tok::string_literal) &&
+                                  CurrentChange.Tok->is(tok::string_literal) &&
                                   Changes[i - 1].Tok->is(tok::string_literal);
     bool SkipMatchCheck = InsideNestedScope || ContinuedStringLiteral;
 
-    if (Changes[i].NewlinesBefore > 0 && !SkipMatchCheck) {
+    if (CurrentChange.NewlinesBefore > 0 && !SkipMatchCheck) {
       Shift = 0;
       FoundMatchOnLine = false;
     }
@@ -339,23 +340,26 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
     // If this is the first matching token to be aligned, remember by how many
     // spaces it has to be shifted, so the rest of the changes on the line are
     // shifted by the same amount
-    if (!FoundMatchOnLine && !SkipMatchCheck && Matches(Changes[i])) {
+    if (!FoundMatchOnLine && !SkipMatchCheck && Matches(CurrentChange)) {
       FoundMatchOnLine = true;
-      Shift = Column - (RightJustify ? Changes[i].TokenLength : 0) -
-              Changes[i].StartOfTokenColumn;
-      Changes[i].Spaces += Shift;
+      Shift = Column - (RightJustify ? CurrentChange.TokenLength : 0) -
+              CurrentChange.StartOfTokenColumn;
+      CurrentChange.Spaces += Shift;
       // FIXME: This is a workaround that should be removed when we fix
       // http://llvm.org/PR53699. An assertion later below verifies this.
-      if (Changes[i].NewlinesBefore == 0) {
-        Changes[i].Spaces =
-            std::max(Changes[i].Spaces,
-                     static_cast<int>(Changes[i].Tok->SpacesRequiredBefore));
+      if (CurrentChange.NewlinesBefore == 0) {
+        CurrentChange.Spaces =
+            std::max(CurrentChange.Spaces,
+                     static_cast<int>(CurrentChange.Tok->SpacesRequiredBefore));
       }
     }
 
+    if (Shift == 0)
+      continue;
+
     // This is for function parameters that are split across multiple lines,
     // as mentioned in the ScopeStack comment.
-    if (InsideNestedScope && Changes[i].NewlinesBefore > 0) {
+    if (InsideNestedScope && CurrentChange.NewlinesBefore > 0) {
       unsigned ScopeStart = ScopeStack.back();
       auto ShouldShiftBeAdded = [&] {
         // Function declaration
@@ -372,35 +376,36 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
           return true;
         }
 
-        // Continued function call
+        // Continued (template) function call.
         if (ScopeStart > Start + 1 &&
-            Changes[ScopeStart - 2].Tok->is(tok::identifier) &&
+            Changes[ScopeStart - 2].Tok->isOneOf(tok::identifier,
+                                                 TT_TemplateCloser) &&
             Changes[ScopeStart - 1].Tok->is(tok::l_paren) &&
             Changes[ScopeStart].Tok->isNot(TT_LambdaLSquare)) {
-          if (Changes[i].Tok->MatchingParen &&
-              Changes[i].Tok->MatchingParen->is(TT_LambdaLBrace)) {
+          if (CurrentChange.Tok->MatchingParen &&
+              CurrentChange.Tok->MatchingParen->is(TT_LambdaLBrace)) {
             return false;
           }
           if (Changes[ScopeStart].NewlinesBefore > 0)
             return false;
-          if (Changes[i].Tok->is(tok::l_brace) &&
-              Changes[i].Tok->is(BK_BracedInit)) {
+          if (CurrentChange.Tok->is(tok::l_brace) &&
+              CurrentChange.Tok->is(BK_BracedInit)) {
             return true;
           }
           return Style.BinPackArguments;
         }
 
         // Ternary operator
-        if (Changes[i].Tok->is(TT_ConditionalExpr))
+        if (CurrentChange.Tok->is(TT_ConditionalExpr))
           return true;
 
         // Period Initializer .XXX = 1.
-        if (Changes[i].Tok->is(TT_DesignatedInitializerPeriod))
+        if (CurrentChange.Tok->is(TT_DesignatedInitializerPeriod))
           return true;
 
         // Continued ternary operator
-        if (Changes[i].Tok->Previous &&
-            Changes[i].Tok->Previous->is(TT_ConditionalExpr)) {
+        if (CurrentChange.Tok->Previous &&
+            CurrentChange.Tok->Previous->is(TT_ConditionalExpr)) {
           return true;
         }
 
@@ -408,8 +413,8 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
         if (ScopeStart > Start + 1 &&
             Changes[ScopeStart - 2].Tok->is(tok::identifier) &&
             Changes[ScopeStart - 1].Tok->is(tok::l_brace) &&
-            Changes[i].Tok->is(tok::l_brace) &&
-            Changes[i].Tok->is(BK_BracedInit)) {
+            CurrentChange.Tok->is(tok::l_brace) &&
+            CurrentChange.Tok->is(BK_BracedInit)) {
           return true;
         }
 
@@ -417,7 +422,7 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
         if (ScopeStart > Start + 1 &&
             Changes[ScopeStart - 2].Tok->isNot(tok::identifier) &&
             Changes[ScopeStart - 1].Tok->is(tok::l_brace) &&
-            Changes[i].Tok->isNot(tok::r_brace)) {
+            CurrentChange.Tok->isNot(tok::r_brace)) {
           for (unsigned OuterScopeStart : llvm::reverse(ScopeStack)) {
             // Lambda.
             if (OuterScopeStart > Start &&
@@ -438,26 +443,26 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
       };
 
       if (ShouldShiftBeAdded())
-        Changes[i].Spaces += Shift;
+        CurrentChange.Spaces += Shift;
     }
 
     if (ContinuedStringLiteral)
-      Changes[i].Spaces += Shift;
+      CurrentChange.Spaces += Shift;
 
     // We should not remove required spaces unless we break the line before.
-    assert(Shift >= 0 || Changes[i].NewlinesBefore > 0 ||
-           Changes[i].Spaces >=
+    assert(Shift > 0 || Changes[i].NewlinesBefore > 0 ||
+           CurrentChange.Spaces >=
                static_cast<int>(Changes[i].Tok->SpacesRequiredBefore) ||
-           Changes[i].Tok->is(tok::eof));
+           CurrentChange.Tok->is(tok::eof));
 
-    Changes[i].StartOfTokenColumn += Shift;
+    CurrentChange.StartOfTokenColumn += Shift;
     if (i + 1 != Changes.size())
       Changes[i + 1].PreviousEndOfTokenColumn += Shift;
 
     // If PointerAlignment is PAS_Right, keep *s or &s next to the token
     if ((Style.PointerAlignment == FormatStyle::PAS_Right ||
          Style.ReferenceAlignment == FormatStyle::RAS_Right) &&
-        Changes[i].Spaces != 0) {
+        CurrentChange.Spaces != 0) {
       const bool ReferenceNotRightAligned =
           Style.ReferenceAlignment != FormatStyle::RAS_Right &&
           Style.ReferenceAlignment != FormatStyle::RAS_Pointer;
@@ -578,16 +583,17 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
 
   unsigned i = StartAt;
   for (unsigned e = Changes.size(); i != e; ++i) {
-    if (Changes[i].indentAndNestingLevel() < IndentAndNestingLevel)
+    auto &CurrentChange = Changes[i];
+    if (CurrentChange.indentAndNestingLevel() < IndentAndNestingLevel)
       break;
 
-    if (Changes[i].NewlinesBefore != 0) {
+    if (CurrentChange.NewlinesBefore != 0) {
       CommasBeforeMatch = 0;
       EndOfSequence = i;
 
       // Whether to break the alignment sequence because of an empty line.
       bool EmptyLineBreak =
-          (Changes[i].NewlinesBefore > 1) && !ACS.AcrossEmptyLines;
+          (CurrentChange.NewlinesBefore > 1) && !ACS.AcrossEmptyLines;
 
       // Whether to break the alignment sequence because of a line without a
       // match.
@@ -599,19 +605,19 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
 
       // A new line starts, re-initialize line status tracking bools.
       // Keep the match state if a string literal is continued on this line.
-      if (i == 0 || Changes[i].Tok->isNot(tok::string_literal) ||
+      if (i == 0 || CurrentChange.Tok->isNot(tok::string_literal) ||
           Changes[i - 1].Tok->isNot(tok::string_literal)) {
         FoundMatchOnLine = false;
       }
       LineIsComment = true;
     }
 
-    if (Changes[i].Tok->isNot(tok::comment))
+    if (CurrentChange.Tok->isNot(tok::comment))
       LineIsComment = false;
 
-    if (Changes[i].Tok->is(tok::comma)) {
+    if (CurrentChange.Tok->is(tok::comma)) {
       ++CommasBeforeMatch;
-    } else if (Changes[i].indentAndNestingLevel() > IndentAndNestingLevel) {
+    } else if (CurrentChange.indentAndNestingLevel() > IndentAndNestingLevel) {
       // Call AlignTokens recursively, skipping over this scope block.
       unsigned StoppedAt =
           AlignTokens(Style, Matches, Changes, i, ACS, RightJustify);
@@ -619,7 +625,7 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
       continue;
     }
 
-    if (!Matches(Changes[i]))
+    if (!Matches(CurrentChange))
       continue;
 
     // If there is more than one matching token per line, or if the number of
@@ -633,16 +639,16 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
     if (StartOfSequence == 0)
       StartOfSequence = i;
 
-    unsigned ChangeWidthLeft = Changes[i].StartOfTokenColumn;
+    unsigned ChangeWidthLeft = CurrentChange.StartOfTokenColumn;
     unsigned ChangeWidthAnchor = 0;
     unsigned ChangeWidthRight = 0;
     if (RightJustify)
       if (ACS.PadOperators)
-        ChangeWidthAnchor = Changes[i].TokenLength;
+        ChangeWidthAnchor = CurrentChange.TokenLength;
       else
-        ChangeWidthLeft += Changes[i].TokenLength;
+        ChangeWidthLeft += CurrentChange.TokenLength;
     else
-      ChangeWidthRight = Changes[i].TokenLength;
+      ChangeWidthRight = CurrentChange.TokenLength;
     for (unsigned j = i + 1; j != e && Changes[j].NewlinesBefore == 0; ++j) {
       ChangeWidthRight += Changes[j].Spaces;
       // Changes are generally 1:1 with the tokens, but a change could also be
@@ -974,11 +980,7 @@ void WhitespaceManager::alignConsecutiveDeclarations() {
   AlignTokens(
       Style,
       [](Change const &C) {
-        if (C.Tok->is(TT_FunctionDeclarationName) && C.Tok->Previous &&
-            C.Tok->Previous->isNot(tok::tilde)) {
-          return true;
-        }
-        if (C.Tok->is(TT_FunctionTypeLParen))
+        if (C.Tok->isOneOf(TT_FunctionDeclarationName, TT_FunctionTypeLParen))
           return true;
         if (C.Tok->isNot(TT_StartOfName))
           return false;
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index e5db8a654e6707..846e5fce6de7b2 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -585,6 +585,11 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__HIP_MEMORY_SCOPE_WORKGROUP", "3");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_AGENT", "4");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
+    if (LangOpts.HIPStdPar) {
+      Builder.defineMacro("__HIPSTDPAR__");
+      if (LangOpts.HIPStdParInterposeAlloc)
+        Builder.defineMacro("__HIPSTDPAR_INTERPOSE_ALLOC__");
+    }
     if (LangOpts.CUDAIsDevice) {
       Builder.defineMacro("__HIP_DEVICE_COMPILE__");
       if (!TI.hasHIPImageSupport()) {
diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index 58aa55d7476903..11e1e7d032586f 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -14,9 +14,6 @@
 #endif
 
 #if !defined(__HIPCC_RTC__)
-#if defined(__cplusplus)
-#include <algorithm>
-#endif
 #include <limits.h>
 #include <stdint.h>
 #ifdef __OPENMP_AMDGCN__
@@ -1311,11 +1308,11 @@ double min(double __x, double __y) { return __builtin_fmin(__x, __y); }
 
 #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 __host__ inline static int min(int __arg1, int __arg2) {
-  return std::min(__arg1, __arg2);
+  return __arg1 < __arg2 ? __arg1 : __arg2;
 }
 
 __host__ inline static int max(int __arg1, int __arg2) {
-  return std::max(__arg1, __arg2);
+  return __arg1 > __arg2 ? __arg1 : __arg2;
 }
 #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 #endif
diff --git a/clang/lib/Headers/__stddef_max_align_t.h b/clang/lib/Headers/__stddef_max_align_t.h
index e3b439285d0fd0..512606a877288b 100644
--- a/clang/lib/Headers/__stddef_max_align_t.h
+++ b/clang/lib/Headers/__stddef_max_align_t.h
@@ -1,4 +1,4 @@
-/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+/*===---- __stddef_max_align_t.h - Definition of max_align_t ---------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Headers/__stddef_null.h b/clang/lib/Headers/__stddef_null.h
index 77d7a54025380b..7336fdab389723 100644
--- a/clang/lib/Headers/__stddef_null.h
+++ b/clang/lib/Headers/__stddef_null.h
@@ -7,7 +7,15 @@
  *===-----------------------------------------------------------------------===
  */
 
+#if !defined(NULL) || !__has_feature(modules)
+
+/* linux/stddef.h will define NULL to 0. glibc (and other) headers then define
+ * __need_NULL and rely on stddef.h to redefine NULL to the correct value again.
+ * Modules don't support redefining macros like that, but support that pattern
+ * in the non-modules case.
+ */
 #undef NULL
+
 #ifdef __cplusplus
 #if !defined(__MINGW32__) && !defined(_MSC_VER)
 #define NULL __null
@@ -17,3 +25,5 @@
 #else
 #define NULL ((void*)0)
 #endif
+
+#endif
diff --git a/clang/lib/Headers/__stddef_nullptr_t.h b/clang/lib/Headers/__stddef_nullptr_t.h
index 8d23ed6dc8c697..183d394d56c1b7 100644
--- a/clang/lib/Headers/__stddef_nullptr_t.h
+++ b/clang/lib/Headers/__stddef_nullptr_t.h
@@ -7,11 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(_NULLPTR_T) || __has_feature(modules)
-/* Always define nullptr_t when modules are available. */
-#if !__has_feature(modules)
+#ifndef _NULLPTR_T
 #define _NULLPTR_T
-#endif
+
 #ifdef __cplusplus
 #if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
 namespace std {
@@ -19,7 +17,8 @@ typedef decltype(nullptr) nullptr_t;
 }
 using ::std::nullptr_t;
 #endif
-#else
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 typedef typeof(nullptr) nullptr_t;
 #endif
+
 #endif
diff --git a/clang/lib/Headers/__stddef_offsetof.h b/clang/lib/Headers/__stddef_offsetof.h
index 84af8357c285b5..3b347b3b92f62c 100644
--- a/clang/lib/Headers/__stddef_offsetof.h
+++ b/clang/lib/Headers/__stddef_offsetof.h
@@ -7,7 +7,6 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(offsetof) || __has_feature(modules)
-/* Always define offsetof when modules are available. */
+#ifndef offsetof
 #define offsetof(t, d) __builtin_offsetof(t, d)
 #endif
diff --git a/clang/lib/Headers/__stddef_ptrdiff_t.h b/clang/lib/Headers/__stddef_ptrdiff_t.h
index 853cee5e09799b..3ea6d7d2852e1c 100644
--- a/clang/lib/Headers/__stddef_ptrdiff_t.h
+++ b/clang/lib/Headers/__stddef_ptrdiff_t.h
@@ -7,10 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(_PTRDIFF_T) || __has_feature(modules)
-/* Always define ptrdiff_t when modules are available. */
-#if !__has_feature(modules)
+#ifndef _PTRDIFF_T
 #define _PTRDIFF_T
-#endif
+
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
 #endif
diff --git a/clang/lib/Headers/__stddef_rsize_t.h b/clang/lib/Headers/__stddef_rsize_t.h
index f9c31cf8f9746a..b6428d0c12b62a 100644
--- a/clang/lib/Headers/__stddef_rsize_t.h
+++ b/clang/lib/Headers/__stddef_rsize_t.h
@@ -7,10 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(_RSIZE_T) || __has_feature(modules)
-/* Always define rsize_t when modules are available. */
-#if !__has_feature(modules)
+#ifndef _RSIZE_T
 #define _RSIZE_T
-#endif
+
 typedef __SIZE_TYPE__ rsize_t;
+
 #endif
diff --git a/clang/lib/Headers/__stddef_size_t.h b/clang/lib/Headers/__stddef_size_t.h
index b30e245e0eb5c9..e4a389510bcdbf 100644
--- a/clang/lib/Headers/__stddef_size_t.h
+++ b/clang/lib/Headers/__stddef_size_t.h
@@ -7,10 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(_SIZE_T) || __has_feature(modules)
-/* Always define size_t when modules are available. */
-#if !__has_feature(modules)
+#ifndef _SIZE_T
 #define _SIZE_T
-#endif
+
 typedef __SIZE_TYPE__ size_t;
+
 #endif
diff --git a/clang/lib/Headers/__stddef_unreachable.h b/clang/lib/Headers/__stddef_unreachable.h
index 5252932cd27698..3e7fe01979662a 100644
--- a/clang/lib/Headers/__stddef_unreachable.h
+++ b/clang/lib/Headers/__stddef_unreachable.h
@@ -7,7 +7,6 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(unreachable) || __has_feature(modules)
-/* Always define unreachable when modules are available. */
+#ifndef unreachable
 #define unreachable() __builtin_unreachable()
 #endif
diff --git a/clang/lib/Headers/__stddef_wchar_t.h b/clang/lib/Headers/__stddef_wchar_t.h
index c3f0516a4b1edf..16a6186512c0c3 100644
--- a/clang/lib/Headers/__stddef_wchar_t.h
+++ b/clang/lib/Headers/__stddef_wchar_t.h
@@ -8,14 +8,16 @@
  */
 
 #if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
-/* Always define wchar_t when modules are available. */
-#if !defined(_WCHAR_T) || __has_feature(modules)
-#if !__has_feature(modules)
+
+#ifndef _WCHAR_T
 #define _WCHAR_T
-#if defined(_MSC_EXTENSIONS)
+
+#ifdef _MSC_EXTENSIONS
 #define _WCHAR_T_DEFINED
 #endif
-#endif
+
 typedef __WCHAR_TYPE__ wchar_t;
+
 #endif
+
 #endif
diff --git a/clang/lib/Headers/__stddef_wint_t.h b/clang/lib/Headers/__stddef_wint_t.h
index d0582ef971efda..0aa2915079571a 100644
--- a/clang/lib/Headers/__stddef_wint_t.h
+++ b/clang/lib/Headers/__stddef_wint_t.h
@@ -7,10 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
-/* Always define wint_t when modules are available. */
-#if !defined(_WINT_T) || __has_feature(modules)
-#if !__has_feature(modules)
+#ifndef _WINT_T
 #define _WINT_T
-#endif
+
 typedef __WINT_TYPE__ wint_t;
+
 #endif
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 9729786923c2d5..b796bb773ec11f 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -1569,6 +1569,15 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
   ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                      (__v4df)(__m256d)(b), (int)(mask)))
 
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
@@ -1594,6 +1603,126 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
 
+/// Compares each of the corresponding double-precision values of two
+///    128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand.
+///
+///    Returns a [2 x double] vector consisting of two doubles corresponding to
+///    the two comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_pd(a, b, c) \
+  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                 (__v2df)(__m128d)(b), (c)))
+
+/// Compares each of the corresponding values of two 128-bit vectors of
+///    [4 x float], using the operation specified by the immediate integer
+///    operand.
+///
+///    Returns a [4 x float] vector consisting of four floats corresponding to
+///    the four comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ps(a, b, c) \
+  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                                (__v4sf)(__m128)(b), (c)))
+
 /// Compares each of the corresponding double-precision values of two
 ///    256-bit vectors of [4 x double], using the operation specified by the
 ///    immediate integer operand.
@@ -1714,6 +1843,124 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
   ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (c)))
 
+/// Compares each of the corresponding scalar double-precision values of
+///    two 128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand.
+///
+///    If the result is true, all 64 bits of the destination vector are set;
+///    otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_sd(a, b, c) \
+  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                 (__v2df)(__m128d)(b), (c)))
+
+/// Compares each of the corresponding scalar values of two 128-bit
+///    vectors of [4 x float], using the operation specified by the immediate
+///    integer operand.
+///
+///    If the result is true, all 32 bits of the destination vector are set;
+///    otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ss(a, b, c) \
+  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                                (__v4sf)(__m128)(b), (c)))
+
 /// Takes a [8 x i32] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
 ///
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 20cd1d4d732fca..8de2864b110653 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4742,127 +4742,6 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
   return (__m128d)__a;
 }
 
-/// Compares each of the corresponding double-precision values of two
-///    128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    Returns a [2 x double] vector consisting of two doubles corresponding to
-///    the two comparison results: zero if the comparison is false, and all 1's
-///    if the comparison is true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    (Note that without avx enabled, only bits [2:0] are supported) \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_pd(a, b, c)                                                    \
-  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
-                                 (c)))
-
-/// Compares each of the corresponding scalar double-precision values of
-///    two 128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand.
-///
-///    If the result is true, all 64 bits of the destination vector are set;
-///    otherwise they are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [2 x double].
-/// \param b
-///    A 128-bit vector of [2 x double].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    (Note that without avx enabled, only bits [2:0] are supported) \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_sd(a, b, c)                                                    \
-  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
-                                 (c)))
-
 #if defined(__cplusplus)
 extern "C" {
 #endif
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 6894672ef0529b..a786689d391773 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -153,9 +153,164 @@ module _Builtin_intrinsics [system] [extern_c] {
   }
 }
 
-module _Builtin_stddef_max_align_t [system] [extern_c] {
-  header "__stddef_max_align_t.h"
+// Start -fbuiltin-headers-in-system-modules affected modules
+
+// The following modules all ignore their top level headers
+// when -fbuiltin-headers-in-system-modules is passed, and
+// most of those headers join system modules when present.
+
+// e.g. if -fbuiltin-headers-in-system-modules is passed, then
+// float.h will not be in the _Builtin_float module (that module
+// will be empty). If there is a system module that declares
+// `header "float.h"`, then the builtin float.h will join
+// that module. The system float.h (if present) will be treated
+// as a textual header in the sytem module.
+module _Builtin_float [system] {
+  header "float.h"
+  export *
+}
+
+module _Builtin_inttypes [system] {
+  header "inttypes.h"
+  export *
+}
+
+module _Builtin_iso646 [system] {
+  header "iso646.h"
+  export *
+}
+
+module _Builtin_limits [system] {
+  header "limits.h"
+  export *
+}
+
+module _Builtin_stdalign [system] {
+  header "stdalign.h"
+  export *
+}
+
+// When -fbuiltin-headers-in-system-modules is passed, only
+// the top level headers are removed, the implementation headers
+// will always be in their submodules. That means when stdarg.h
+// is included, it will still import this module and make the
+// appropriate submodules visible.
+module _Builtin_stdarg [system] {
+  textual header "stdarg.h"
+
+  explicit module __gnuc_va_list {
+    header "__stdarg___gnuc_va_list.h"
+    export *
+  }
+
+  explicit module __va_copy {
+    header "__stdarg___va_copy.h"
+    export *
+  }
+
+  explicit module va_arg {
+    header "__stdarg_va_arg.h"
+    export *
+  }
+
+  explicit module va_copy {
+    header "__stdarg_va_copy.h"
+    export *
+  }
+
+  explicit module va_list {
+    header "__stdarg_va_list.h"
+    export *
+  }
+}
+
+module _Builtin_stdatomic [system] {
+  header "stdatomic.h"
+  export *
+}
+
+module _Builtin_stdbool [system] {
+  header "stdbool.h"
+  export *
+}
+
+module _Builtin_stddef [system] {
+  textual header "stddef.h"
+
+  explicit module max_align_t {
+    header "__stddef_max_align_t.h"
+    export *
+  }
+
+  explicit module null {
+    header "__stddef_null.h"
+    export *
+  }
+
+  explicit module nullptr_t {
+    header "__stddef_nullptr_t.h"
+    export *
+  }
+
+  explicit module offsetof {
+    header "__stddef_offsetof.h"
+    export *
+  }
+
+  explicit module ptrdiff_t {
+    header "__stddef_ptrdiff_t.h"
+    export *
+  }
+
+  explicit module rsize_t {
+    header "__stddef_rsize_t.h"
+    export *
+  }
+
+  explicit module size_t {
+    header "__stddef_size_t.h"
+    export *
+  }
+
+  explicit module unreachable {
+    header "__stddef_unreachable.h"
+    export *
+  }
+
+  explicit module wchar_t {
+    header "__stddef_wchar_t.h"
+    export *
+  }
+}
+
+/* wint_t is provided by <wchar.h> and not <stddef.h>. It's here
+ * for compatibility, but must be explicitly requested. Therefore
+ * __stddef_wint_t.h is not part of _Builtin_stddef. */
+module _Builtin_stddef_wint_t [system] {
+  header "__stddef_wint_t.h"
+  export *
+}
+
+module _Builtin_stdint [system] {
+  header "stdint.h"
+  export *
+}
+
+module _Builtin_stdnoreturn [system] {
+  header "stdnoreturn.h"
+  export *
+}
+
+module _Builtin_tgmath [system] {
+  header "tgmath.h"
+  export *
+}
+
+module _Builtin_unwind [system] {
+  header "unwind.h"
+  export *
 }
+// End -fbuiltin-headers-in-system-modules affected modules
 
 module opencl_c {
   requires opencl
diff --git a/clang/lib/Headers/stdarg.h b/clang/lib/Headers/stdarg.h
index 085e2bb861ceb5..94b066566f084e 100644
--- a/clang/lib/Headers/stdarg.h
+++ b/clang/lib/Headers/stdarg.h
@@ -7,9 +7,31 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(__STDARG_H) || defined(__need___va_list) ||                       \
-    defined(__need_va_list) || defined(__need_va_arg) ||                       \
-    defined(__need___va_copy) || defined(__need_va_copy)
+/*
+ * This header is designed to be included multiple times. If any of the __need_
+ * macros are defined, then only that subset of interfaces are provided. This
+ * can be useful for POSIX headers that need to not expose all of stdarg.h, but
+ * need to use some of its interfaces. Otherwise this header provides all of
+ * the expected interfaces.
+ *
+ * When clang modules are enabled, this header is a textual header. It ignores
+ * its header guard so that multiple submodules can export its interfaces.
+ * Take module SM with submodules A and B, whose headers both include stdarg.h
+ * When SM.A builds, __STDARG_H will be defined. When SM.B builds, the
+ * definition from SM.A will leak when building without local submodule
+ * visibility. stdarg.h wouldn't include any of its implementation headers, and
+ * SM.B wouldn't import any of the stdarg modules, and SM.B's `export *`
+ * wouldn't export any stdarg interfaces as expected. However, since stdarg.h
+ * ignores its header guard when building with modules, it all works as
+ * expected.
+ *
+ * When clang modules are not enabled, the header guards can function in the
+ * normal simple fashion.
+ */
+#if !defined(__STDARG_H) || __has_feature(modules) ||                          \
+    defined(__need___va_list) || defined(__need_va_list) ||                    \
+    defined(__need_va_arg) || defined(__need___va_copy) ||                     \
+    defined(__need_va_copy)
 
 #if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
     !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
diff --git a/clang/lib/Headers/stddef.h b/clang/lib/Headers/stddef.h
index 9c6dd0cd62dae1..e0ad7b8d17aff9 100644
--- a/clang/lib/Headers/stddef.h
+++ b/clang/lib/Headers/stddef.h
@@ -7,23 +7,41 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
-    defined(__need_size_t) || defined(__need_rsize_t) ||                       \
-    defined(__need_wchar_t) || defined(__need_NULL) ||                         \
-    defined(__need_nullptr_t) || defined(__need_unreachable) ||                \
-    defined(__need_max_align_t) || defined(__need_offsetof) ||                 \
-    defined(__need_wint_t) ||                                                  \
-    (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1)
+/*
+ * This header is designed to be included multiple times. If any of the __need_
+ * macros are defined, then only that subset of interfaces are provided. This
+ * can be useful for POSIX headers that need to not expose all of stddef.h, but
+ * need to use some of its interfaces. Otherwise this header provides all of
+ * the expected interfaces.
+ *
+ * When clang modules are enabled, this header is a textual header. It ignores
+ * its header guard so that multiple submodules can export its interfaces.
+ * Take module SM with submodules A and B, whose headers both include stddef.h
+ * When SM.A builds, __STDDEF_H will be defined. When SM.B builds, the
+ * definition from SM.A will leak when building without local submodule
+ * visibility. stddef.h wouldn't include any of its implementation headers, and
+ * SM.B wouldn't import any of the stddef modules, and SM.B's `export *`
+ * wouldn't export any stddef interfaces as expected. However, since stddef.h
+ * ignores its header guard when building with modules, it all works as
+ * expected.
+ *
+ * When clang modules are not enabled, the header guards can function in the
+ * normal simple fashion.
+ */
+#if !defined(__STDDEF_H) || __has_feature(modules) ||                          \
+    (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1) ||        \
+    defined(__need_ptrdiff_t) || defined(__need_size_t) ||                     \
+    defined(__need_rsize_t) || defined(__need_wchar_t) ||                      \
+    defined(__need_NULL) || defined(__need_nullptr_t) ||                       \
+    defined(__need_unreachable) || defined(__need_max_align_t) ||              \
+    defined(__need_offsetof) || defined(__need_wint_t)
 
 #if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
     !defined(__need_rsize_t) && !defined(__need_wchar_t) &&                    \
     !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
     !defined(__need_unreachable) && !defined(__need_max_align_t) &&            \
     !defined(__need_offsetof) && !defined(__need_wint_t)
-/* Always define miscellaneous pieces when modules are available. */
-#if !__has_feature(modules)
 #define __STDDEF_H
-#endif
 #define __need_ptrdiff_t
 #define __need_size_t
 /* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 73b1ab4bb4a3d3..2d1d33c69f0bbd 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2936,134 +2936,6 @@ _mm_movemask_ps(__m128 __a)
   return __builtin_ia32_movmskps((__v4sf)__a);
 }
 
-/* Compare */
-#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
-#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
-#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
-#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
-#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
-#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
-#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
-#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
-
-/// Compares each of the corresponding values of two 128-bit vectors of
-///    [4 x float], using the operation specified by the immediate integer
-///    operand.
-///
-///    Returns a [4 x float] vector consisting of four floats corresponding to
-///    the four comparison results: zero if the comparison is false, and all 1's
-///    if the comparison is true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    (Note that without avx enabled, only bits [2:0] are supported) \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ps(a, b, c)                                                    \
-  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
-
-/// Compares each of the corresponding scalar values of two 128-bit
-///    vectors of [4 x float], using the operation specified by the immediate
-///    integer operand.
-///
-///    If the result is true, all 32 bits of the destination vector are set;
-///    otherwise they are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
-///
-/// \param a
-///    A 128-bit vector of [4 x float].
-/// \param b
-///    A 128-bit vector of [4 x float].
-/// \param c
-///    An immediate integer operand, with bits [4:0] specifying which comparison
-///    operation to use: \n
-///    (Note that without avx enabled, only bits [2:0] are supported) \n
-///    0x00: Equal (ordered, non-signaling) \n
-///    0x01: Less-than (ordered, signaling) \n
-///    0x02: Less-than-or-equal (ordered, signaling) \n
-///    0x03: Unordered (non-signaling) \n
-///    0x04: Not-equal (unordered, non-signaling) \n
-///    0x05: Not-less-than (unordered, signaling) \n
-///    0x06: Not-less-than-or-equal (unordered, signaling) \n
-///    0x07: Ordered (non-signaling) \n
-///    0x08: Equal (unordered, non-signaling) \n
-///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
-///    0x0A: Not-greater-than (unordered, signaling) \n
-///    0x0B: False (ordered, non-signaling) \n
-///    0x0C: Not-equal (ordered, non-signaling) \n
-///    0x0D: Greater-than-or-equal (ordered, signaling) \n
-///    0x0E: Greater-than (ordered, signaling) \n
-///    0x0F: True (unordered, non-signaling) \n
-///    0x10: Equal (ordered, signaling) \n
-///    0x11: Less-than (ordered, non-signaling) \n
-///    0x12: Less-than-or-equal (ordered, non-signaling) \n
-///    0x13: Unordered (signaling) \n
-///    0x14: Not-equal (unordered, signaling) \n
-///    0x15: Not-less-than (unordered, non-signaling) \n
-///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-///    0x17: Ordered (signaling) \n
-///    0x18: Equal (unordered, signaling) \n
-///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-///    0x1A: Not-greater-than (unordered, non-signaling) \n
-///    0x1B: False (ordered, signaling) \n
-///    0x1C: Not-equal (ordered, signaling) \n
-///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-///    0x1E: Greater-than (ordered, non-signaling) \n
-///    0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ss(a, b, c)                                                    \
-  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
 
 #define _MM_ALIGN16 __attribute__((aligned(16)))
 
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index e8437572ebf4bf..f65a5f145c0439 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -1551,8 +1551,6 @@ namespace clang {
     /// (or the end of the file).
     void skipUntil(MMToken::TokenKind K);
 
-    using ModuleId = SmallVector<std::pair<std::string, SourceLocation>, 2>;
-
     bool parseModuleId(ModuleId &Id);
     void parseModuleDecl();
     void parseExternModuleDecl();
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 73c37bc5161c4b..c16dd75fbbd3f5 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -998,6 +998,18 @@ void Preprocessor::Lex(Token &Result) {
   }
 }
 
+void Preprocessor::LexTokensUntilEOF(std::vector<Token> *Tokens) {
+  while (1) {
+    Token Tok;
+    Lex(Tok);
+    if (Tok.isOneOf(tok::unknown, tok::eof, tok::eod,
+                    tok::annot_repl_input_end))
+      break;
+    if (Tokens != nullptr)
+      Tokens->push_back(Tok);
+  }
+}
+
 /// Lex a header-name token (including one formed from header-name-tokens if
 /// \p AllowConcatenation is \c true).
 ///
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 9bda4ec7d59bbf..bcc70c04dec91b 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -4038,7 +4038,7 @@ void Parser::ParseDeclarationSpecifiers(
       isStorageClass = true;
       break;
     case tok::kw_auto:
-      if (getLangOpts().CPlusPlus11) {
+      if (getLangOpts().CPlusPlus11 || getLangOpts().C23) {
         if (isKnownToBeTypeSpecifier(GetLookAheadToken(1))) {
           isInvalid = DS.SetStorageClassSpec(Actions, DeclSpec::SCS_auto, Loc,
                                              PrevSpec, DiagID, Policy);
@@ -5009,7 +5009,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
 
       BaseRange = SourceRange(ColonLoc, DeclaratorInfo.getSourceRange().getEnd());
 
-      if (!getLangOpts().ObjC) {
+      if (!getLangOpts().ObjC && !getLangOpts().C23) {
         if (getLangOpts().CPlusPlus11)
           Diag(ColonLoc, diag::warn_cxx98_compat_enum_fixed_underlying_type)
               << BaseRange;
@@ -5835,8 +5835,7 @@ bool Parser::isDeclarationSpecifier(
 bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
                                      DeclSpec::FriendSpecified IsFriend,
                                      const ParsedTemplateInfo *TemplateInfo) {
-  TentativeParsingAction TPA(*this);
-
+  RevertingTentativeParsingAction TPA(*this);
   // Parse the C++ scope specifier.
   CXXScopeSpec SS;
   if (TemplateInfo && TemplateInfo->TemplateParams)
@@ -5845,7 +5844,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
   if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
                                      /*ObjectHasErrors=*/false,
                                      /*EnteringContext=*/true)) {
-    TPA.Revert();
     return false;
   }
 
@@ -5857,7 +5855,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
   } else if (Tok.is(tok::annot_template_id)) {
     ConsumeAnnotationToken();
   } else {
-    TPA.Revert();
     return false;
   }
 
@@ -5867,7 +5864,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
 
   // Current class name must be followed by a left parenthesis.
   if (Tok.isNot(tok::l_paren)) {
-    TPA.Revert();
     return false;
   }
   ConsumeParen();
@@ -5876,7 +5872,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
   // that we have a constructor.
   if (Tok.is(tok::r_paren) ||
       (Tok.is(tok::ellipsis) && NextToken().is(tok::r_paren))) {
-    TPA.Revert();
     return true;
   }
 
@@ -5885,7 +5880,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
   if (getLangOpts().CPlusPlus11 &&
       isCXX11AttributeSpecifier(/*Disambiguate*/ false,
                                 /*OuterMightBeMessageSend*/ true)) {
-    TPA.Revert();
     return true;
   }
 
@@ -5906,9 +5900,17 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
   // If we parsed a scope specifier as well as friend,
   // we might be parsing a friend constructor.
   bool IsConstructor = false;
-  if (isDeclarationSpecifier(IsFriend && !SS.isSet()
-                                 ? ImplicitTypenameContext::No
-                                 : ImplicitTypenameContext::Yes))
+  ImplicitTypenameContext ITC = IsFriend && !SS.isSet()
+                                    ? ImplicitTypenameContext::No
+                                    : ImplicitTypenameContext::Yes;
+  // Constructors cannot have this parameters, but we support that scenario here
+  // to improve diagnostic.
+  if (Tok.is(tok::kw_this)) {
+    ConsumeToken();
+    return isDeclarationSpecifier(ITC);
+  }
+
+  if (isDeclarationSpecifier(ITC))
     IsConstructor = true;
   else if (Tok.is(tok::identifier) ||
            (Tok.is(tok::annot_cxxscope) && NextToken().is(tok::identifier))) {
@@ -5977,8 +5979,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide,
       break;
     }
   }
-
-  TPA.Revert();
   return IsConstructor;
 }
 
@@ -7356,6 +7356,7 @@ void Parser::ParseFunctionDeclaratorIdentifierList(
 ///           '=' assignment-expression
 /// [GNU]   declaration-specifiers abstract-declarator[opt] attributes
 /// [C++11] attribute-specifier-seq parameter-declaration
+/// [C++2b] attribute-specifier-seq 'this' parameter-declaration
 ///
 void Parser::ParseParameterDeclarationClause(
     DeclaratorContext DeclaratorCtx, ParsedAttributes &FirstArgAttrs,
@@ -7420,9 +7421,16 @@ void Parser::ParseParameterDeclarationClause(
 
     SourceLocation DSStart = Tok.getLocation();
 
+    // Parse a C++23 Explicit Object Parameter
+    // We do that in all language modes to produce a better diagnostic.
+    SourceLocation ThisLoc;
+    if (getLangOpts().CPlusPlus && Tok.is(tok::kw_this))
+      ThisLoc = ConsumeToken();
+
     ParseDeclarationSpecifiers(DS, /*TemplateInfo=*/ParsedTemplateInfo(),
                                AS_none, DeclSpecContext::DSC_normal,
                                /*LateAttrs=*/nullptr, AllowImplicitTypename);
+
     DS.takeAttributesFrom(ArgDeclSpecAttrs);
 
     // Parse the declarator.  This is "PrototypeContext" or
@@ -7436,6 +7444,9 @@ void Parser::ParseParameterDeclarationClause(
                                   : DeclaratorContext::Prototype);
     ParseDeclarator(ParmDeclarator);
 
+    if (ThisLoc.isValid())
+      ParmDeclarator.SetRangeBegin(ThisLoc);
+
     // Parse GNU attributes, if present.
     MaybeParseGNUAttributes(ParmDeclarator);
     if (getLangOpts().HLSL)
@@ -7505,7 +7516,8 @@ void Parser::ParseParameterDeclarationClause(
       }
       // Inform the actions module about the parameter declarator, so it gets
       // added to the current scope.
-      Decl *Param = Actions.ActOnParamDeclarator(getCurScope(), ParmDeclarator);
+      Decl *Param =
+          Actions.ActOnParamDeclarator(getCurScope(), ParmDeclarator, ThisLoc);
       // Parse the default argument, if any. We parse the default
       // arguments in all dialects; the semantic analysis in
       // ActOnParamDefaultArgument will reject the default argument in
@@ -7691,7 +7703,7 @@ void Parser::ParseBracketDeclarator(Declarator &D) {
     // Parse the constant-expression or assignment-expression now (depending
     // on dialect).
     if (getLangOpts().CPlusPlus) {
-      NumElements = ParseConstantExpression();
+      NumElements = ParseArrayBoundExpression();
     } else {
       EnterExpressionEvaluationContext Unevaluated(
           Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 711990bff2851e..9dbfc1c8c5e9ff 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -221,6 +221,15 @@ ExprResult Parser::ParseConstantExpression() {
   return ParseConstantExpressionInExprEvalContext(NotTypeCast);
 }
 
+ExprResult Parser::ParseArrayBoundExpression() {
+  EnterExpressionEvaluationContext ConstantEvaluated(
+      Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+  // If we parse the bound of a VLA... we parse a non-constant
+  // constant-expression!
+  Actions.ExprEvalContexts.back().InConditionallyConstantEvaluateContext = true;
+  return ParseConstantExpressionInExprEvalContext(NotTypeCast);
+}
+
 ExprResult Parser::ParseCaseExpression(SourceLocation CaseLoc) {
   EnterExpressionEvaluationContext ConstantEvaluated(
       Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 8a8a126bf7244d..995834a78c795f 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3416,6 +3416,17 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
   case OMPC_ompx_attribute:
     Clause = ParseOpenMPOMPXAttributesClause(WrongDirective);
     break;
+  case OMPC_ompx_bare:
+    if (WrongDirective)
+      Diag(Tok, diag::note_ompx_bare_clause)
+          << getOpenMPClauseName(CKind) << "target teams";
+    if (!ErrorFound && !getLangOpts().OpenMPExtensions) {
+      Diag(Tok, diag::err_omp_unexpected_clause_extension_only)
+          << getOpenMPClauseName(CKind) << getOpenMPDirectiveName(DKind);
+      ErrorFound = true;
+    }
+    Clause = ParseOpenMPClause(CKind, WrongDirective);
+    break;
   default:
     break;
   }
diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp
index 03c19bf5d96e38..8b653b1c4f8eaf 100644
--- a/clang/lib/Parse/ParseTentative.cpp
+++ b/clang/lib/Parse/ParseTentative.cpp
@@ -1571,6 +1571,17 @@ Parser::isCXXDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename,
   case tok::kw___vector:
     return TPResult::True;
 
+  case tok::kw_this: {
+    // Try to parse a C++23 Explicit Object Parameter
+    // We do that in all language modes to produce a better diagnostic.
+    if (getLangOpts().CPlusPlus) {
+      RevertingTentativeParsingAction PA(*this);
+      ConsumeToken();
+      return isCXXDeclarationSpecifier(AllowImplicitTypename, BracedCastResult,
+                                       InvalidAsDeclSpec);
+    }
+    return TPResult::False;
+  }
   case tok::annot_template_id: {
     TemplateIdAnnotation *TemplateId = takeTemplateIdAnnotation(Tok);
     // If lookup for the template-name found nothing, don't assume we have a
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 77bb560eb6288f..0947e8b0f526a3 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -1983,6 +1983,12 @@ class ThreadSafetyReporter : public clang::threadSafety::ThreadSafetyHandler {
         case POK_PtPassByRef:
           DiagID = diag::warn_pt_guarded_pass_by_reference;
           break;
+        case POK_ReturnByRef:
+          DiagID = diag::warn_guarded_return_by_reference;
+          break;
+        case POK_PtReturnByRef:
+          DiagID = diag::warn_pt_guarded_return_by_reference;
+          break;
       }
       PartialDiagnosticAt Warning(Loc, S.PDiag(DiagID) << Kind
                                                        << D
@@ -2013,6 +2019,12 @@ class ThreadSafetyReporter : public clang::threadSafety::ThreadSafetyHandler {
         case POK_PtPassByRef:
           DiagID = diag::warn_pt_guarded_pass_by_reference;
           break;
+        case POK_ReturnByRef:
+          DiagID = diag::warn_guarded_return_by_reference;
+          break;
+        case POK_PtReturnByRef:
+          DiagID = diag::warn_pt_guarded_return_by_reference;
+          break;
       }
       PartialDiagnosticAt Warning(Loc, S.PDiag(DiagID) << Kind
                                                        << D
diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp
index d59778b5b614b7..781f24cb71ae99 100644
--- a/clang/lib/Sema/DeclSpec.cpp
+++ b/clang/lib/Sema/DeclSpec.cpp
@@ -420,6 +420,18 @@ bool Declarator::isStaticMember() {
               getName().OperatorFunctionId.Operator));
 }
 
+bool Declarator::isExplicitObjectMemberFunction() {
+  if (!isFunctionDeclarator())
+    return false;
+  DeclaratorChunk::FunctionTypeInfo &Fun = getFunctionTypeInfo();
+  if (Fun.NumParams) {
+    auto *P = dyn_cast_or_null<ParmVarDecl>(Fun.Params[0].Param);
+    if (P && P->isExplicitObjectParameter())
+      return true;
+  }
+  return false;
+}
+
 bool Declarator::isCtorOrDtor() {
   return (getName().getKind() == UnqualifiedIdKind::IK_ConstructorName) ||
          (getName().getKind() == UnqualifiedIdKind::IK_DestructorName);
@@ -1363,8 +1375,9 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) {
     StorageClassSpecLoc = SourceLocation();
   }
   // Diagnose if we've recovered from an ill-formed 'auto' storage class
-  // specifier in a pre-C++11 dialect of C++.
-  if (!S.getLangOpts().CPlusPlus11 && TypeSpecType == TST_auto)
+  // specifier in a pre-C++11 dialect of C++ or in a pre-C23 dialect of C.
+  if (!S.getLangOpts().CPlusPlus11 && !S.getLangOpts().C23 &&
+      TypeSpecType == TST_auto)
     S.Diag(TSTLoc, diag::ext_auto_type_specifier);
   if (S.getLangOpts().CPlusPlus && !S.getLangOpts().CPlusPlus11 &&
       StorageClassSpec == SCS_auto)
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index af24d727c2bdbd..fd86a5f8b49c0a 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -175,8 +175,8 @@ struct BuiltinTypeDeclBuilder {
                                   SourceLocation(), FPOptionsOverride());
 
     CXXThisExpr *This = CXXThisExpr::Create(
-        AST, SourceLocation(),
-        Constructor->getThisObjectType(), true);
+        AST, SourceLocation(), Constructor->getFunctionObjectParameterType(),
+        true);
     Expr *Handle = MemberExpr::CreateImplicit(AST, This, false, Fields["h"],
                                               Fields["h"]->getType(), VK_LValue,
                                               OK_Ordinary);
@@ -260,9 +260,9 @@ struct BuiltinTypeDeclBuilder {
     auto FnProtoLoc = TSInfo->getTypeLoc().getAs<FunctionProtoTypeLoc>();
     FnProtoLoc.setParam(0, IdxParam);
 
-    auto *This = CXXThisExpr::Create(
-        AST, SourceLocation(),
-        MethodDecl->getThisObjectType(), true);
+    auto *This =
+        CXXThisExpr::Create(AST, SourceLocation(),
+                            MethodDecl->getFunctionObjectParameterType(), true);
     auto *HandleAccess = MemberExpr::CreateImplicit(
         AST, This, false, Handle, Handle->getType(), VK_LValue, OK_Ordinary);
 
diff --git a/clang/lib/Sema/ScopeInfo.cpp b/clang/lib/Sema/ScopeInfo.cpp
index 92ce5137f4f311..ce90451f2613bf 100644
--- a/clang/lib/Sema/ScopeInfo.cpp
+++ b/clang/lib/Sema/ScopeInfo.cpp
@@ -248,6 +248,14 @@ void LambdaScopeInfo::visitPotentialCaptures(
   }
 }
 
+bool LambdaScopeInfo::lambdaCaptureShouldBeConst() const {
+  if (ExplicitObjectParameter)
+    return ExplicitObjectParameter->getType()
+        .getNonReferenceType()
+        .isConstQualified();
+  return !Mutable;
+}
+
 FunctionScopeInfo::~FunctionScopeInfo() { }
 BlockScopeInfo::~BlockScopeInfo() { }
 CapturedRegionScopeInfo::~CapturedRegionScopeInfo() { }
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 4489cb0deae4c8..67533ccbdf347c 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -221,7 +221,6 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
       CurScope(nullptr), Ident_super(nullptr) {
   assert(pp.TUKind == TUKind);
   TUScope = nullptr;
-  isConstantEvaluatedOverride = false;
 
   LoadedExternalKnownNamespaces = false;
   for (unsigned I = 0; I != NSAPI::NumNSNumberLiteralMethods; ++I)
@@ -2072,7 +2071,7 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
         targetDiag(D->getLocation(), diag::note_defined_here, FD) << D;
     }
 
-    if (Ty->isRVVType())
+    if (TI.hasRISCVVTypes() && Ty->isRVVType())
       checkRVVTypeSupport(Ty, Loc, D);
 
     // Don't allow SVE types in functions without a SVE target.
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 88f5484575db17..7c4083e4ec4d4b 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -249,6 +249,15 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
     return CFP_Native;
 
+  // HipStdPar mode is special, in that assessing whether a device side call to
+  // a host target is deferred to a subsequent pass, and cannot unambiguously be
+  // adjudicated in the AST, hence we optimistically allow them to pass here.
+  if (getLangOpts().HIPStdPar &&
+      (CallerTarget == CFT_Global || CallerTarget == CFT_Device ||
+       CallerTarget == CFT_HostDevice) &&
+      CalleeTarget == CFT_Host)
+    return CFP_HostDevice;
+
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
     // It's OK to call a compilation-mode matching function from an HD one.
@@ -803,7 +812,7 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   assert(Callee && "Callee may not be null.");
 
-  auto &ExprEvalCtx = ExprEvalContexts.back();
+  const auto &ExprEvalCtx = currentEvaluationContext();
   if (ExprEvalCtx.isUnevaluated() || ExprEvalCtx.isConstantEvaluated())
     return true;
 
@@ -895,7 +904,7 @@ void Sema::CUDACheckLambdaCapture(CXXMethodDecl *Callee,
   if (!ShouldCheck || !Capture.isReferenceCapture())
     return;
   auto DiagKind = SemaDiagnosticBuilder::K_Deferred;
-  if (Capture.isVariableCapture()) {
+  if (Capture.isVariableCapture() && !getLangOpts().HIPStdPar) {
     SemaDiagnosticBuilder(DiagKind, Capture.getLocation(),
                           diag::err_capture_bad_target, Callee, *this)
         << Capture.getVariable();
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 6a3b5fa61d5945..446e35218bff0a 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -126,7 +126,7 @@ static bool checkArgCountAtLeast(Sema &S, CallExpr *Call,
 
   return S.Diag(Call->getEndLoc(), diag::err_typecheck_call_too_few_args)
          << 0 /*function call*/ << MinArgCount << ArgCount
-         << Call->getSourceRange();
+         << /*is non object*/ 0 << Call->getSourceRange();
 }
 
 /// Checks that a call expression's argument count is at most the desired
@@ -139,7 +139,7 @@ static bool checkArgCountAtMost(Sema &S, CallExpr *Call, unsigned MaxArgCount) {
   return S.Diag(Call->getEndLoc(),
                 diag::err_typecheck_call_too_many_args_at_most)
          << 0 /*function call*/ << MaxArgCount << ArgCount
-         << Call->getSourceRange();
+         << /*is non object*/ 0 << Call->getSourceRange();
 }
 
 /// Checks that a call expression's argument count is in the desired range. This
@@ -168,7 +168,7 @@ static bool checkArgCount(Sema &S, CallExpr *Call, unsigned DesiredArgCount) {
 
   return S.Diag(Range.getBegin(), diag::err_typecheck_call_too_many_args)
          << 0 /*function call*/ << DesiredArgCount << ArgCount
-         << Call->getArg(1)->getSourceRange();
+         << /*is non object*/ 0 << Call->getArg(1)->getSourceRange();
 }
 
 static bool convertArgumentToType(Sema &S, Expr *&Value, QualType Ty) {
@@ -217,7 +217,7 @@ static bool SemaBuiltinMSVCAnnotation(Sema &S, CallExpr *TheCall) {
   // We need at least one argument.
   if (TheCall->getNumArgs() < 1) {
     S.Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least)
-        << 0 << 1 << TheCall->getNumArgs()
+        << 0 << 1 << TheCall->getNumArgs() << /*is non object*/ 0
         << TheCall->getCallee()->getSourceRange();
     return true;
   }
@@ -1076,7 +1076,7 @@ static bool ProcessFormatStringLiteral(const Expr *FormatExpr,
 void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD,
                                                CallExpr *TheCall) {
   if (TheCall->isValueDependent() || TheCall->isTypeDependent() ||
-      isConstantEvaluated())
+      isConstantEvaluatedContext())
     return;
 
   bool UseDABAttr = false;
@@ -1578,7 +1578,7 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) {
   if (NumArgs < 4) {
     S.Diag(TheCall->getBeginLoc(),
            diag::err_typecheck_call_too_few_args_at_least)
-        << 0 << 4 << NumArgs;
+        << 0 << 4 << NumArgs << /*is non object*/ 0;
     return true;
   }
 
@@ -3192,7 +3192,7 @@ bool Sema::CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
 
 bool Sema::CheckARMCoprocessorImmediate(const TargetInfo &TI,
                                         const Expr *CoprocArg, bool WantCDE) {
-  if (isConstantEvaluated())
+  if (isConstantEvaluatedContext())
     return false;
 
   // We can't check the value of a dependent argument.
@@ -6276,10 +6276,6 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case X86::BI__builtin_ia32_cmpss:
   case X86::BI__builtin_ia32_cmppd:
   case X86::BI__builtin_ia32_cmpsd:
-    i = 2;
-    l = 0;
-    u = TI.hasFeature("avx") ? 31 : 7;
-    break;
   case X86::BI__builtin_ia32_cmpps256:
   case X86::BI__builtin_ia32_cmppd256:
   case X86::BI__builtin_ia32_cmpps128_mask:
@@ -6616,7 +6612,7 @@ static void CheckNonNullArguments(Sema &S,
   assert((FDecl || Proto) && "Need a function declaration or prototype");
 
   // Already checked by constant evaluator.
-  if (S.isConstantEvaluated())
+  if (S.isConstantEvaluatedContext())
     return;
   // Check the attributes attached to the method/function itself.
   llvm::SmallBitVector NonNullArgs;
@@ -6924,8 +6920,9 @@ void Sema::CheckConstructorCall(FunctionDecl *FDecl, QualType ThisType,
       Proto->isVariadic() ? VariadicConstructor : VariadicDoesNotApply;
 
   auto *Ctor = cast<CXXConstructorDecl>(FDecl);
-  CheckArgAlignment(Loc, FDecl, "'this'", Context.getPointerType(ThisType),
-                    Context.getPointerType(Ctor->getThisObjectType()));
+  CheckArgAlignment(
+      Loc, FDecl, "'this'", Context.getPointerType(ThisType),
+      Context.getPointerType(Ctor->getFunctionObjectParameterType()));
 
   checkCall(FDecl, Proto, /*ThisArg=*/nullptr, Args, /*IsMemberFunction=*/true,
             Loc, SourceRange(), CallType);
@@ -6945,14 +6942,16 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
   unsigned NumArgs = TheCall->getNumArgs();
 
   Expr *ImplicitThis = nullptr;
-  if (IsMemberOperatorCall && !FDecl->isStatic()) {
+  if (IsMemberOperatorCall && !FDecl->isStatic() &&
+      !FDecl->hasCXXExplicitFunctionObjectParameter()) {
     // If this is a call to a non-static member operator, hide the first
     // argument from checkCall.
     // FIXME: Our choice of AST representation here is less than ideal.
     ImplicitThis = Args[0];
     ++Args;
     --NumArgs;
-  } else if (IsMemberFunction && !FDecl->isStatic())
+  } else if (IsMemberFunction && !FDecl->isStatic() &&
+             !FDecl->hasCXXExplicitFunctionObjectParameter())
     ImplicitThis =
         cast<CXXMemberCallExpr>(TheCall)->getImplicitObjectArgument();
 
@@ -6965,8 +6964,8 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
       ThisType = Context.getPointerType(ThisType);
     }
 
-    QualType ThisTypeFromDecl =
-        Context.getPointerType(cast<CXXMethodDecl>(FDecl)->getThisObjectType());
+    QualType ThisTypeFromDecl = Context.getPointerType(
+        cast<CXXMethodDecl>(FDecl)->getFunctionObjectParameterType());
 
     CheckArgAlignment(TheCall->getRParenLoc(), FDecl, "'this'", ThisType,
                       ThisTypeFromDecl);
@@ -7293,13 +7292,13 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange,
   if (Args.size() < AdjustedNumArgs) {
     Diag(CallRange.getEnd(), diag::err_typecheck_call_too_few_args)
         << 0 << AdjustedNumArgs << static_cast<unsigned>(Args.size())
-        << ExprRange;
+        << /*is non object*/ 0 << ExprRange;
     return ExprError();
   } else if (Args.size() > AdjustedNumArgs) {
     Diag(Args[AdjustedNumArgs]->getBeginLoc(),
          diag::err_typecheck_call_too_many_args)
         << 0 << AdjustedNumArgs << static_cast<unsigned>(Args.size())
-        << ExprRange;
+        << /*is non object*/ 0 << ExprRange;
     return ExprError();
   }
 
@@ -7662,7 +7661,8 @@ bool Sema::BuiltinWasmRefNullExtern(CallExpr *TheCall) {
 bool Sema::BuiltinWasmRefNullFunc(CallExpr *TheCall) {
   if (TheCall->getNumArgs() != 0) {
     Diag(TheCall->getBeginLoc(), diag::err_typecheck_call_too_many_args)
-         << 0 /*function call*/ << 0 << TheCall->getNumArgs();
+        << 0 /*function call*/ << /*expected*/ 0 << TheCall->getNumArgs()
+        << /*is non object*/ 0;
     return true;
   }
 
@@ -7695,7 +7695,8 @@ Sema::SemaBuiltinAtomicOverloaded(ExprResult TheCallResult) {
   // Ensure that we have at least one argument to do type inference from.
   if (TheCall->getNumArgs() < 1) {
     Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least)
-        << 0 << 1 << TheCall->getNumArgs() << Callee->getSourceRange();
+        << 0 << 1 << TheCall->getNumArgs() << /*is non object*/ 0
+        << Callee->getSourceRange();
     return ExprError();
   }
 
@@ -7971,7 +7972,7 @@ Sema::SemaBuiltinAtomicOverloaded(ExprResult TheCallResult) {
   // have at least that many.
   if (TheCall->getNumArgs() < 1+NumFixed) {
     Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least)
-        << 0 << 1 + NumFixed << TheCall->getNumArgs()
+        << 0 << 1 + NumFixed << TheCall->getNumArgs() << /*is non object*/ 0
         << Callee->getSourceRange();
     return ExprError();
   }
@@ -8361,7 +8362,8 @@ bool Sema::SemaBuiltinVAStartARMMicrosoft(CallExpr *Call) {
   if (Call->getNumArgs() < 3)
     return Diag(Call->getEndLoc(),
                 diag::err_typecheck_call_too_few_args_at_least)
-           << 0 /*function call*/ << 3 << Call->getNumArgs();
+           << 0 /*function call*/ << 3 << Call->getNumArgs()
+           << /*is non object*/ 0;
 
   // Type-check the first argument normally.
   if (checkBuiltinArgument(*this, Call, 0))
@@ -8622,7 +8624,7 @@ ExprResult Sema::SemaBuiltinShuffleVector(CallExpr *TheCall) {
     return ExprError(Diag(TheCall->getEndLoc(),
                           diag::err_typecheck_call_too_few_args_at_least)
                      << 0 /*function call*/ << 2 << TheCall->getNumArgs()
-                     << TheCall->getSourceRange());
+                     << /*is non object*/ 0 << TheCall->getSourceRange());
 
   // Determine which of the following types of shufflevector we're checking:
   // 1) unary, vector mask: (lhs, mask)
@@ -8742,7 +8744,8 @@ bool Sema::SemaBuiltinPrefetch(CallExpr *TheCall) {
   if (NumArgs > 3)
     return Diag(TheCall->getEndLoc(),
                 diag::err_typecheck_call_too_many_args_at_most)
-           << 0 /*function call*/ << 3 << NumArgs << TheCall->getSourceRange();
+           << 0 /*function call*/ << 3 << NumArgs << /*is non object*/ 0
+           << TheCall->getSourceRange();
 
   // Argument 0 is checked for us and the remaining arguments must be
   // constant integers.
@@ -8881,13 +8884,13 @@ bool Sema::SemaBuiltinOSLogFormat(CallExpr *TheCall) {
   if (NumArgs < NumRequiredArgs) {
     return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args)
            << 0 /* function call */ << NumRequiredArgs << NumArgs
-           << TheCall->getSourceRange();
+           << /*is non object*/ 0 << TheCall->getSourceRange();
   }
   if (NumArgs >= NumRequiredArgs + 0x100) {
     return Diag(TheCall->getEndLoc(),
                 diag::err_typecheck_call_too_many_args_at_most)
            << 0 /* function call */ << (NumRequiredArgs + 0xff) << NumArgs
-           << TheCall->getSourceRange();
+           << /*is non object*/ 0 << TheCall->getSourceRange();
   }
   unsigned i = 0;
 
@@ -8973,7 +8976,7 @@ bool Sema::SemaBuiltinConstantArg(CallExpr *TheCall, int ArgNum,
 /// TheCall is a constant expression in the range [Low, High].
 bool Sema::SemaBuiltinConstantArgRange(CallExpr *TheCall, int ArgNum,
                                        int Low, int High, bool RangeIsError) {
-  if (isConstantEvaluated())
+  if (isConstantEvaluatedContext())
     return false;
   llvm::APSInt Result;
 
@@ -9687,7 +9690,7 @@ checkFormatStringExpr(Sema &S, const Expr *E, ArrayRef<const Expr *> Args,
                       llvm::SmallBitVector &CheckedVarArgs,
                       UncoveredArgHandler &UncoveredArg, llvm::APSInt Offset,
                       bool IgnoreStringsWithoutSpecifiers = false) {
-  if (S.isConstantEvaluated())
+  if (S.isConstantEvaluatedContext())
     return SLCT_NotALiteral;
 tryAgain:
   assert(Offset.isSigned() && "invalid offset");
@@ -9727,8 +9730,8 @@ checkFormatStringExpr(Sema &S, const Expr *E, ArrayRef<const Expr *> Args,
     bool CheckLeft = true, CheckRight = true;
 
     bool Cond;
-    if (C->getCond()->EvaluateAsBooleanCondition(Cond, S.getASTContext(),
-                                                 S.isConstantEvaluated())) {
+    if (C->getCond()->EvaluateAsBooleanCondition(
+            Cond, S.getASTContext(), S.isConstantEvaluatedContext())) {
       if (Cond)
         CheckRight = false;
       else
@@ -9992,9 +9995,11 @@ checkFormatStringExpr(Sema &S, const Expr *E, ArrayRef<const Expr *> Args,
       Expr::EvalResult LResult, RResult;
 
       bool LIsInt = BinOp->getLHS()->EvaluateAsInt(
-          LResult, S.Context, Expr::SE_NoSideEffects, S.isConstantEvaluated());
+          LResult, S.Context, Expr::SE_NoSideEffects,
+          S.isConstantEvaluatedContext());
       bool RIsInt = BinOp->getRHS()->EvaluateAsInt(
-          RResult, S.Context, Expr::SE_NoSideEffects, S.isConstantEvaluated());
+          RResult, S.Context, Expr::SE_NoSideEffects,
+          S.isConstantEvaluatedContext());
 
       if (LIsInt != RIsInt) {
         BinaryOperatorKind BinOpKind = BinOp->getOpcode();
@@ -10022,7 +10027,7 @@ checkFormatStringExpr(Sema &S, const Expr *E, ArrayRef<const Expr *> Args,
       Expr::EvalResult IndexResult;
       if (ASE->getRHS()->EvaluateAsInt(IndexResult, S.Context,
                                        Expr::SE_NoSideEffects,
-                                       S.isConstantEvaluated())) {
+                                       S.isConstantEvaluatedContext())) {
         sumOffsets(Offset, IndexResult.Val.getInt(), BO_Add,
                    /*RHS is int*/ true);
         E = ASE->getBase();
@@ -11349,12 +11354,15 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
       ImplicitMatch == ArgType::NoMatchTypeConfusion)
     Match = ImplicitMatch;
   assert(Match != ArgType::MatchPromotion);
+
   // Look through unscoped enums to their underlying type.
   bool IsEnum = false;
   bool IsScopedEnum = false;
+  QualType IntendedTy = ExprTy;
   if (auto EnumTy = ExprTy->getAs<EnumType>()) {
+    IntendedTy = EnumTy->getDecl()->getIntegerType();
     if (EnumTy->isUnscopedEnumerationType()) {
-      ExprTy = EnumTy->getDecl()->getIntegerType();
+      ExprTy = IntendedTy;
       // This controls whether we're talking about the underlying type or not,
       // which we only want to do when it's an unscoped enum.
       IsEnum = true;
@@ -11366,7 +11374,6 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
   // %C in an Objective-C context prints a unichar, not a wchar_t.
   // If the argument is an integer of some kind, believe the %C and suggest
   // a cast instead of changing the conversion specifier.
-  QualType IntendedTy = ExprTy;
   if (isObjCContext() &&
       FS.getConversionSpecifier().getKind() == ConversionSpecifier::CArg) {
     if (ExprTy->isIntegralOrUnscopedEnumerationType() &&
@@ -11402,8 +11409,10 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
     std::tie(CastTy, CastTyName) = shouldNotPrintDirectly(S.Context, IntendedTy, E);
     if (!CastTy.isNull()) {
       // %zi/%zu and %td/%tu are OK to use for NSInteger/NSUInteger of type int
-      // (long in ASTContext). Only complain to pedants.
-      if ((CastTyName == "NSInteger" || CastTyName == "NSUInteger") &&
+      // (long in ASTContext). Only complain to pedants or when they're the
+      // underlying type of a scoped enum (which always needs a cast).
+      if (!IsScopedEnum &&
+          (CastTyName == "NSInteger" || CastTyName == "NSUInteger") &&
           (AT.isSizeT() || AT.isPtrdiffT()) &&
           AT.matchesType(S.Context, CastTy))
         Match = ArgType::NoMatchPedantic;
@@ -11458,20 +11467,15 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
       // should be printed as 'long' for 64-bit compatibility.)
       // Rather than emitting a normal format/argument mismatch, we want to
       // add a cast to the recommended type (and correct the format string
-      // if necessary).
+      // if necessary). We should also do so for scoped enumerations.
       SmallString<16> CastBuf;
       llvm::raw_svector_ostream CastFix(CastBuf);
       CastFix << (S.LangOpts.CPlusPlus ? "static_cast<" : "(");
-      if (IsScopedEnum) {
-        CastFix << AT.getRepresentativeType(S.Context).getAsString(
-            S.Context.getPrintingPolicy());
-      } else {
-        IntendedTy.print(CastFix, S.Context.getPrintingPolicy());
-      }
+      IntendedTy.print(CastFix, S.Context.getPrintingPolicy());
       CastFix << (S.LangOpts.CPlusPlus ? ">" : ")");
 
       SmallVector<FixItHint,4> Hints;
-      if ((!AT.matchesType(S.Context, IntendedTy) && !IsScopedEnum) ||
+      if (AT.matchesType(S.Context, IntendedTy) != ArgType::Match ||
           ShouldNotPrintDirectly)
         Hints.push_back(FixItHint::CreateReplacement(SpecRange, os.str()));
 
@@ -11499,7 +11503,7 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
         Hints.push_back(FixItHint::CreateInsertion(After, ")"));
       }
 
-      if (ShouldNotPrintDirectly) {
+      if (ShouldNotPrintDirectly && !IsScopedEnum) {
         // The expression has a type that should not be printed directly.
         // We extract the name from the typedef because we don't want to show
         // the underlying type in the diagnostic.
@@ -13953,7 +13957,7 @@ static bool CheckTautologicalComparison(Sema &S, BinaryOperator *E,
     return false;
 
   IntRange OtherValueRange = GetExprRange(
-      S.Context, Other, S.isConstantEvaluated(), /*Approximate*/ false);
+      S.Context, Other, S.isConstantEvaluatedContext(), /*Approximate=*/false);
 
   QualType OtherT = Other->getType();
   if (const auto *AT = OtherT->getAs<AtomicType>())
@@ -14168,8 +14172,9 @@ static void AnalyzeComparison(Sema &S, BinaryOperator *E) {
   }
 
   // Otherwise, calculate the effective range of the signed operand.
-  IntRange signedRange = GetExprRange(
-      S.Context, signedOperand, S.isConstantEvaluated(), /*Approximate*/ true);
+  IntRange signedRange =
+      GetExprRange(S.Context, signedOperand, S.isConstantEvaluatedContext(),
+                   /*Approximate=*/true);
 
   // Go ahead and analyze implicit conversions in the operands.  Note
   // that we skip the implicit conversions on both sides.
@@ -14187,8 +14192,8 @@ static void AnalyzeComparison(Sema &S, BinaryOperator *E) {
   if (E->isEqualityOp()) {
     unsigned comparisonWidth = S.Context.getIntWidth(T);
     IntRange unsignedRange =
-        GetExprRange(S.Context, unsignedOperand, S.isConstantEvaluated(),
-                     /*Approximate*/ true);
+        GetExprRange(S.Context, unsignedOperand, S.isConstantEvaluatedContext(),
+                     /*Approximate=*/true);
 
     // We should never be unable to prove that the unsigned operand is
     // non-negative.
@@ -15051,7 +15056,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
     if (Target->isUnsaturatedFixedPointType()) {
       Expr::EvalResult Result;
       if (E->EvaluateAsFixedPoint(Result, S.Context, Expr::SE_AllowSideEffects,
-                                  S.isConstantEvaluated())) {
+                                  S.isConstantEvaluatedContext())) {
         llvm::APFixedPoint Value = Result.Val.getFixedPoint();
         llvm::APFixedPoint MaxVal = S.Context.getFixedPointMax(T);
         llvm::APFixedPoint MinVal = S.Context.getFixedPointMin(T);
@@ -15066,7 +15071,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
       }
     } else if (Target->isIntegerType()) {
       Expr::EvalResult Result;
-      if (!S.isConstantEvaluated() &&
+      if (!S.isConstantEvaluatedContext() &&
           E->EvaluateAsFixedPoint(Result, S.Context,
                                   Expr::SE_AllowSideEffects)) {
         llvm::APFixedPoint FXResult = Result.Val.getFixedPoint();
@@ -15089,7 +15094,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
   } else if (Target->isUnsaturatedFixedPointType()) {
     if (Source->isIntegerType()) {
       Expr::EvalResult Result;
-      if (!S.isConstantEvaluated() &&
+      if (!S.isConstantEvaluatedContext() &&
           E->EvaluateAsInt(Result, S.Context, Expr::SE_AllowSideEffects)) {
         llvm::APSInt Value = Result.Val.getInt();
 
@@ -15115,8 +15120,9 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
   if (SourceBT && TargetBT && SourceBT->isIntegerType() &&
       TargetBT->isFloatingType() && !IsListInit) {
     // Determine the number of precision bits in the source integer type.
-    IntRange SourceRange = GetExprRange(S.Context, E, S.isConstantEvaluated(),
-                                        /*Approximate*/ true);
+    IntRange SourceRange =
+        GetExprRange(S.Context, E, S.isConstantEvaluatedContext(),
+                     /*Approximate=*/true);
     unsigned int SourcePrecision = SourceRange.Width;
 
     // Determine the number of precision bits in the
@@ -15184,8 +15190,8 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
 
   IntRange SourceTypeRange =
       IntRange::forTargetOfCanonicalType(S.Context, Source);
-  IntRange LikelySourceRange =
-      GetExprRange(S.Context, E, S.isConstantEvaluated(), /*Approximate*/ true);
+  IntRange LikelySourceRange = GetExprRange(
+      S.Context, E, S.isConstantEvaluatedContext(), /*Approximate=*/true);
   IntRange TargetRange = IntRange::forTargetOfCanonicalType(S.Context, Target);
 
   if (LikelySourceRange.Width > TargetRange.Width) {
@@ -15193,7 +15199,7 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
     // TODO: this should happen for bitfield stores, too.
     Expr::EvalResult Result;
     if (E->EvaluateAsInt(Result, S.Context, Expr::SE_AllowSideEffects,
-                         S.isConstantEvaluated())) {
+                         S.isConstantEvaluatedContext())) {
       llvm::APSInt Value(32);
       Value = Result.Val.getInt();
 
@@ -16057,7 +16063,8 @@ class SequenceChecker : public ConstEvaluatedExprVisitor<SequenceChecker> {
       if (!EvalOK || E->isValueDependent())
         return false;
       EvalOK = E->EvaluateAsBooleanCondition(
-          Result, Self.SemaRef.Context, Self.SemaRef.isConstantEvaluated());
+          Result, Self.SemaRef.Context,
+          Self.SemaRef.isConstantEvaluatedContext());
       return EvalOK;
     }
 
@@ -17184,7 +17191,7 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
                             const ArraySubscriptExpr *ASE,
                             bool AllowOnePastEnd, bool IndexNegated) {
   // Already diagnosed by the constant evaluator.
-  if (isConstantEvaluated())
+  if (isConstantEvaluatedContext())
     return;
 
   IndexExpr = IndexExpr->IgnoreParenImpCasts();
@@ -18573,7 +18580,7 @@ void Sema::CheckArgumentWithTypeTag(const ArgumentWithTypeTagAttr *Attr,
   TypeTagData TypeInfo;
   if (!GetMatchingCType(ArgumentKind, TypeTagExpr, Context,
                         TypeTagForDatatypeMagicValues.get(), FoundWrongKind,
-                        TypeInfo, isConstantEvaluated())) {
+                        TypeInfo, isConstantEvaluatedContext())) {
     if (FoundWrongKind)
       Diag(TypeTagExpr->getExprLoc(),
            diag::warn_type_tag_for_datatype_wrong_kind)
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 036548b68247bf..0ef03293b46ffb 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -702,8 +702,7 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
   }
 
   ContextRAII SavedContext{*this, CtxToSave};
-  LocalInstantiationScope Scope(*this, !ForOverloadResolution ||
-                                           isLambdaCallOperator(FD));
+  LocalInstantiationScope Scope(*this, !ForOverloadResolution);
   std::optional<MultiLevelTemplateArgumentList> MLTAL =
       SetupConstraintCheckingTemplateArgumentsAndScope(
           const_cast<FunctionDecl *>(FD), {}, Scope);
@@ -720,7 +719,8 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
   CXXThisScopeRAII ThisScope(*this, Record, ThisQuals, Record != nullptr);
 
   LambdaScopeForCallOperatorInstantiationRAII LambdaScope(
-      *this, const_cast<FunctionDecl *>(FD), *MLTAL, Scope);
+      *this, const_cast<FunctionDecl *>(FD), *MLTAL, Scope,
+      ForOverloadResolution);
 
   return CheckConstraintSatisfaction(
       FD, {FD->getTrailingRequiresClause()}, *MLTAL,
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index cfbe176d8300c2..d2b0922a4bb9c4 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -70,7 +70,7 @@ static QualType lookupPromiseType(Sema &S, const FunctionDecl *FD,
   // If the function is a non-static member function, add the type
   // of the implicit object parameter before the formal parameters.
   if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
-    if (MD->isInstance()) {
+    if (MD->isImplicitObjectMemberFunction()) {
       // [over.match.funcs]4
       // For non-static member functions, the type of the implicit object
       // parameter is
@@ -78,7 +78,7 @@ static QualType lookupPromiseType(Sema &S, const FunctionDecl *FD,
       //      ref-qualifier or with the & ref-qualifier
       //  -- "rvalue reference to cv X" for functions declared with the &&
       //      ref-qualifier
-      QualType T = MD->getThisObjectType();
+      QualType T = MD->getFunctionObjectParameterType();
       T = FnType->getRefQualifier() == RQ_RValue
               ? S.Context.getRValueReferenceType(T)
               : S.Context.getLValueReferenceType(T, /*SpelledAsLValue*/ true);
@@ -564,10 +564,10 @@ VarDecl *Sema::buildCoroutinePromise(SourceLocation Loc) {
   assert(isa<FunctionDecl>(CurContext) && "not in a function scope");
   auto *FD = cast<FunctionDecl>(CurContext);
   bool IsThisDependentType = [&] {
-    if (auto *MD = dyn_cast_or_null<CXXMethodDecl>(FD))
-      return MD->isInstance() && MD->getThisObjectType()->isDependentType();
-    else
-      return false;
+    if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>(FD))
+      return MD->isImplicitObjectMemberFunction() &&
+             MD->getThisType()->isDependentType();
+    return false;
   }();
 
   QualType T = FD->getType()->isDependentType() || IsThisDependentType
@@ -592,7 +592,7 @@ VarDecl *Sema::buildCoroutinePromise(SourceLocation Loc) {
 
   // Add implicit object parameter.
   if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
-    if (MD->isInstance() && !isLambdaCallOperator(MD)) {
+    if (MD->isImplicitObjectMemberFunction() && !isLambdaCallOperator(MD)) {
       ExprResult ThisExpr = ActOnCXXThis(Loc);
       if (ThisExpr.isInvalid())
         return nullptr;
@@ -1367,7 +1367,7 @@ bool CoroutineStmtBuilder::makeReturnOnAllocFailure() {
 static bool collectPlacementArgs(Sema &S, FunctionDecl &FD, SourceLocation Loc,
                                  SmallVectorImpl<Expr *> &PlacementArgs) {
   if (auto *MD = dyn_cast<CXXMethodDecl>(&FD)) {
-    if (MD->isInstance() && !isLambdaCallOperator(MD)) {
+    if (MD->isImplicitObjectMemberFunction() && !isLambdaCallOperator(MD)) {
       ExprResult ThisExpr = S.ActOnCXXThis(Loc);
       if (ThisExpr.isInvalid())
         return false;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 9c2f1e83ed3fbe..324a3674732aa0 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8924,13 +8924,11 @@ bool Sema::AddOverriddenMethods(CXXRecordDecl *DC, CXXMethodDecl *MD) {
       CXXMethodDecl *BaseMD =
           dyn_cast<CXXMethodDecl>(BaseND->getCanonicalDecl());
       if (!BaseMD || !BaseMD->isVirtual() ||
-          IsOverload(MD, BaseMD, /*UseMemberUsingDeclRules=*/false,
-                     /*ConsiderCudaAttrs=*/true,
-                     // C++2a [class.virtual]p2 does not consider requires
-                     // clauses when overriding.
-                     /*ConsiderRequiresClauses=*/false))
+          IsOverride(MD, BaseMD, /*UseMemberUsingDeclRules=*/false,
+                     /*ConsiderCudaAttrs=*/true))
+        continue;
+      if (!CheckExplicitObjectOverride(MD, BaseMD))
         continue;
-
       if (Overridden.insert(BaseMD).second) {
         MD->addOverriddenMethod(BaseMD);
         CheckOverridingFunctionReturnType(MD, BaseMD);
@@ -9265,6 +9263,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D,
   }
   Expr *TrailingRequiresClause = D.getTrailingRequiresClause();
 
+  SemaRef.CheckExplicitObjectMemberFunction(DC, D, Name, R);
+
   if (Name.getNameKind() == DeclarationName::CXXConstructorName) {
     // This is a C++ constructor declaration.
     assert(DC->isRecord() &&
@@ -9766,8 +9766,9 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
       << DeclSpec::getSpecifierName(TSCS);
 
   if (D.isFirstDeclarationOfMember())
-    adjustMemberFunctionCC(R, D.isStaticMember(), D.isCtorOrDtor(),
-                           D.getIdentifierLoc());
+    adjustMemberFunctionCC(
+        R, !(D.isStaticMember() || D.isExplicitObjectMemberFunction()),
+        D.isCtorOrDtor(), D.getIdentifierLoc());
 
   bool isFriend = false;
   FunctionTemplateDecl *FunctionTemplate = nullptr;
@@ -11971,7 +11972,7 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
       // struct B { struct Y { ~Y(); }; using X = Y; };
       // template struct A<B>;
       if (NewFD->getFriendObjectKind() == Decl::FriendObjectKind::FOK_None ||
-          !Destructor->getThisObjectType()->isDependentType()) {
+          !Destructor->getFunctionObjectParameterType()->isDependentType()) {
         CXXRecordDecl *Record = Destructor->getParent();
         QualType ClassType = Context.getTypeDeclType(Record);
 
@@ -12862,6 +12863,15 @@ QualType Sema::deduceVarTypeFromInitializer(VarDecl *VDecl,
   DeducedType *Deduced = Type->getContainedDeducedType();
   assert(Deduced && "deduceVarTypeFromInitializer for non-deduced type");
 
+  // Diagnose auto array declarations in C23, unless it's a supported extension.
+  if (getLangOpts().C23 && Type->isArrayType() &&
+      !isa_and_present<StringLiteral, InitListExpr>(Init)) {
+      Diag(Range.getBegin(), diag::err_auto_not_allowed)
+          << (int)Deduced->getContainedAutoType()->getKeyword()
+          << /*in array decl*/ 23 << Range;
+    return QualType();
+  }
+
   // C++11 [dcl.spec.auto]p3
   if (!Init) {
     assert(VDecl && "no init for init capture deduction?");
@@ -14918,9 +14928,32 @@ void Sema::CheckFunctionOrTemplateParamDeclarator(Scope *S, Declarator &D) {
   }
 }
 
+static void CheckExplicitObjectParameter(Sema &S, ParmVarDecl *P,
+                                         SourceLocation ExplicitThisLoc) {
+  if (!ExplicitThisLoc.isValid())
+    return;
+  assert(S.getLangOpts().CPlusPlus &&
+         "explicit parameter in non-cplusplus mode");
+  if (!S.getLangOpts().CPlusPlus23)
+    S.Diag(ExplicitThisLoc, diag::err_cxx20_deducing_this)
+        << P->getSourceRange();
+
+  // C++2b [dcl.fct/7] An explicit object parameter shall not be a function
+  // parameter pack.
+  if (P->isParameterPack()) {
+    S.Diag(P->getBeginLoc(), diag::err_explicit_object_parameter_pack)
+        << P->getSourceRange();
+    return;
+  }
+  P->setExplicitObjectParameterLoc(ExplicitThisLoc);
+  if (LambdaScopeInfo *LSI = S.getCurLambda())
+    LSI->ExplicitObjectParameter = P;
+}
+
 /// ActOnParamDeclarator - Called from Parser::ParseFunctionDeclarator()
 /// to introduce parameters into function prototype scope.
-Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D) {
+Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D,
+                                 SourceLocation ExplicitThisLoc) {
   const DeclSpec &DS = D.getDeclSpec();
 
   // Verify C99 6.7.5.3p2: The only SCS allowed is 'register'.
@@ -14998,6 +15031,8 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D) {
   if (D.isInvalidType())
     New->setInvalidDecl();
 
+  CheckExplicitObjectParameter(*this, New, ExplicitThisLoc);
+
   assert(S->isFunctionPrototypeScope());
   assert(S->getFunctionPrototypeDepth() >= 1);
   New->setScopeInfo(S->getFunctionPrototypeDepth() - 1,
@@ -15395,6 +15430,8 @@ LambdaScopeInfo *Sema::RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator) {
 
   LSI->IntroducerRange = DNI.getCXXOperatorNameRange();
   LSI->Mutable = !CallOperator->isConst();
+  if (CallOperator->isExplicitObjectMemberFunction())
+    LSI->ExplicitObjectParameter = CallOperator->getParamDecl(0);
 
   // Add the captures to the LSI so they can be noted as already
   // captured within tryCaptureVar.
@@ -18048,6 +18085,18 @@ void Sema::ActOnTagFinishDefinition(Scope *S, Decl *TagD,
                      [](const FieldDecl *FD) { return FD->isBitField(); }))
       Diag(BraceRange.getBegin(), diag::warn_pragma_align_not_xl_compatible);
   }
+
+  // Check the "counted_by" attribute to ensure that the count field exists in
+  // the struct.
+  if (const auto *RD = dyn_cast<RecordDecl>(Tag)) {
+    auto Pred = [](const Decl *D) {
+      if (const auto *FD = dyn_cast<FieldDecl>(D))
+        return FD->hasAttr<CountedByAttr>();
+      return false;
+    };
+    if (const FieldDecl *FD = RD->findFieldIf(Pred))
+      CheckCountedByAttr(S, FD);
+  }
 }
 
 void Sema::ActOnObjCContainerFinishDefinition() {
@@ -18121,7 +18170,8 @@ ExprResult Sema::VerifyBitField(SourceLocation FieldLoc,
 
   // Zero-width bitfield is ok for anonymous field.
   if (Value == 0 && FieldName)
-    return Diag(FieldLoc, diag::err_bitfield_has_zero_width) << FieldName;
+    return Diag(FieldLoc, diag::err_bitfield_has_zero_width)
+           << FieldName << BitWidth->getSourceRange();
 
   if (Value.isSigned() && Value.isNegative()) {
     if (FieldName)
@@ -18175,8 +18225,8 @@ ExprResult Sema::VerifyBitField(SourceLocation FieldLoc,
 /// to create a FieldDecl object for it.
 Decl *Sema::ActOnField(Scope *S, Decl *TagD, SourceLocation DeclStart,
                        Declarator &D, Expr *BitfieldWidth) {
-  FieldDecl *Res = HandleField(S, cast_or_null<RecordDecl>(TagD),
-                               DeclStart, D, static_cast<Expr*>(BitfieldWidth),
+  FieldDecl *Res = HandleField(S, cast_if_present<RecordDecl>(TagD), DeclStart,
+                               D, BitfieldWidth,
                                /*InitStyle=*/ICIS_NoInit, AS_public);
   return Res;
 }
@@ -18622,6 +18672,9 @@ Decl *Sema::ActOnIvar(Scope *S, SourceLocation DeclStart, Declarator &D,
   ObjCIvarDecl *NewID = ObjCIvarDecl::Create(
       Context, EnclosingContext, DeclStart, Loc, II, T, TInfo, ac, BitWidth);
 
+  if (T->containsErrors())
+    NewID->setInvalidDecl();
+
   if (II) {
     NamedDecl *PrevDecl = LookupSingleName(S, II, Loc, LookupMemberName,
                                            ForVisibleRedeclaration);
@@ -18781,10 +18834,13 @@ static bool AreSpecialMemberFunctionsSameKind(ASTContext &Context,
   if (CSM == Sema::CXXDefaultConstructor)
     return bool(M1->getDescribedFunctionTemplate()) ==
            bool(M2->getDescribedFunctionTemplate());
-  if (!Context.hasSameType(M1->getParamDecl(0)->getType(),
-                           M2->getParamDecl(0)->getType()))
+  // FIXME: better resolve CWG
+  // https://cplusplus.github.io/CWG/issues/2787.html
+  if (!Context.hasSameType(M1->getNonObjectParameter(0)->getType(),
+                           M2->getNonObjectParameter(0)->getType()))
     return false;
-  if (!Context.hasSameType(M1->getThisType(), M2->getThisType()))
+  if (!Context.hasSameType(M1->getFunctionObjectParameterReferenceType(),
+                           M2->getFunctionObjectParameterReferenceType()))
     return false;
 
   return true;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2d7e69946a39e4..47e072ace072b4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1234,7 +1234,7 @@ static void handleConsumableAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 
 static bool checkForConsumableClass(Sema &S, const CXXMethodDecl *MD,
                                     const ParsedAttr &AL) {
-  QualType ThisType = MD->getThisObjectType();
+  QualType ThisType = MD->getFunctionObjectParameterType();
 
   if (const CXXRecordDecl *RD = ThisType->getAsCXXRecordDecl()) {
     if (!RD->hasAttr<ConsumableAttr>()) {
@@ -1336,23 +1336,23 @@ static void handleReturnTypestateAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   // FIXME: This check is currently being done in the analysis.  It can be
   //        enabled here only after the parser propagates attributes at
   //        template specialization definition, not declaration.
-  //QualType ReturnType;
+  // QualType ReturnType;
   //
-  //if (const ParmVarDecl *Param = dyn_cast<ParmVarDecl>(D)) {
+  // if (const ParmVarDecl *Param = dyn_cast<ParmVarDecl>(D)) {
   //  ReturnType = Param->getType();
   //
   //} else if (const CXXConstructorDecl *Constructor =
   //             dyn_cast<CXXConstructorDecl>(D)) {
-  //  ReturnType = Constructor->getThisObjectType();
+  //  ReturnType = Constructor->getFunctionObjectParameterType();
   //
   //} else {
   //
   //  ReturnType = cast<FunctionDecl>(D)->getCallResultType();
   //}
   //
-  //const CXXRecordDecl *RD = ReturnType->getAsCXXRecordDecl();
+  // const CXXRecordDecl *RD = ReturnType->getAsCXXRecordDecl();
   //
-  //if (!RD || !RD->hasAttr<ConsumableAttr>()) {
+  // if (!RD || !RD->hasAttr<ConsumableAttr>()) {
   //    S.Diag(Attr.getLoc(), diag::warn_return_state_for_unconsumable_type) <<
   //      ReturnType.getAsString();
   //    return;
@@ -8361,6 +8361,134 @@ static void handleZeroCallUsedRegsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(ZeroCallUsedRegsAttr::Create(S.Context, Kind, AL));
 }
 
+static void handleCountedByAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (!AL.isArgIdent(0)) {
+    S.Diag(AL.getLoc(), diag::err_attribute_argument_type)
+        << AL << AANT_ArgumentIdentifier;
+    return;
+  }
+
+  IdentifierLoc *IL = AL.getArgAsIdent(0);
+  CountedByAttr *CBA =
+      ::new (S.Context) CountedByAttr(S.Context, AL, IL->Ident);
+  CBA->setCountedByFieldLoc(IL->Loc);
+  D->addAttr(CBA);
+}
+
+namespace {
+
+// Callback to only accept typo corrections that are for field members of
+// the given struct or union.
+class FieldDeclValidatorCCC final : public CorrectionCandidateCallback {
+public:
+  explicit FieldDeclValidatorCCC(const RecordDecl *RD) : Record(RD) {}
+
+  bool ValidateCandidate(const TypoCorrection &candidate) override {
+    FieldDecl *FD = candidate.getCorrectionDeclAs<FieldDecl>();
+    return FD && FD->getDeclContext()->getRedeclContext()->Equals(Record);
+  }
+
+  std::unique_ptr<CorrectionCandidateCallback> clone() override {
+    return std::make_unique<FieldDeclValidatorCCC>(*this);
+  }
+
+private:
+  const RecordDecl *Record;
+};
+
+} // end anonymous namespace
+
+bool Sema::CheckCountedByAttr(Scope *S, const FieldDecl *FD) {
+  const RecordDecl *RD = FD->getParent();
+  const auto *CBA = FD->getAttr<CountedByAttr>();
+  const IdentifierInfo *FieldName = CBA->getCountedByField();
+
+  auto Pred = [&](const Decl *D) {
+    if (const auto *Field = dyn_cast<FieldDecl>(D))
+      return Field->getName() == FieldName->getName();
+    return false;
+  };
+  const FieldDecl *Field = RD->findFieldIf(Pred);
+
+  if (!Field) {
+    // The "counted_by" field needs to exist within the struct.
+    DeclarationNameInfo NameInfo(FieldName,
+                                 CBA->getCountedByFieldLoc().getBegin());
+    LookupResult Result(*this, NameInfo, Sema::LookupOrdinaryName);
+
+    LookupName(Result, S);
+    if (Result.getResultKind() == LookupResult::Found) {
+      SourceRange SR = CBA->getCountedByFieldLoc();
+      Diag(SR.getBegin(),
+           diag::err_flexible_array_counted_by_attr_field_not_found_in_struct)
+          << CBA->getCountedByField() << SR;
+
+      SR = Result.getAsSingle<NamedDecl>()->getSourceRange();
+      Diag(SR.getBegin(), diag::note_var_declared_here)
+          << Result.getAsSingle<NamedDecl>() << SR;
+    } else {
+      SourceRange SR = CBA->getCountedByFieldLoc();
+      FieldDeclValidatorCCC CCC(RD);
+
+      if (TypoCorrection Corrected =
+              CorrectTypo(NameInfo, Sema::LookupMemberName, /*Scope=*/nullptr,
+                          /*SS=*/nullptr, CCC, Sema::CTK_ErrorRecovery,
+                          const_cast<RecordDecl *>(RD))) {
+        diagnoseTypo(
+            Corrected,
+            PDiag(
+                diag::
+                    err_flexible_array_counted_by_attr_field_not_found_suggest)
+                << FieldName);
+      } else {
+        Diag(SR.getBegin(),
+             diag::err_flexible_array_counted_by_attr_field_not_found)
+            << CBA->getCountedByField() << SR;
+      }
+    }
+
+    return true;
+  }
+
+  LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
+      Context.getLangOpts().getStrictFlexArraysLevel();
+
+  if (!Decl::isFlexibleArrayMemberLike(Context, FD, FD->getType(),
+                                       StrictFlexArraysLevel, true)) {
+    // The "counted_by" attribute must be on a flexible array member.
+    SourceRange SR = FD->getLocation();
+    Diag(SR.getBegin(),
+         diag::err_counted_by_attr_not_on_flexible_array_member)
+        << SR;
+    return true;
+  }
+
+  if (Field->hasAttr<CountedByAttr>()) {
+    // The "counted_by" field can't point to the flexible array member.
+    SourceRange SR = CBA->getCountedByFieldLoc();
+    Diag(SR.getBegin(),
+         diag::err_flexible_array_counted_by_attr_refers_to_self)
+        << CBA->getCountedByField() << SR;
+    return true;
+  }
+
+  if (!Field->getType()->isIntegerType() ||
+      Field->getType()->isBooleanType()) {
+    // The "counted_by" field must have an integer type.
+    SourceRange SR = Field->getLocation();
+    Diag(SR.getBegin(),
+         diag::err_flexible_array_counted_by_attr_field_not_integer)
+        << Field << SR;
+
+    SR = CBA->getCountedByFieldLoc();
+    Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
+        << CBA->getCountedByField() << SR;
+    return true;
+  }
+
+  return false;
+}
+
 static void handleFunctionReturnThunksAttr(Sema &S, Decl *D,
                                            const ParsedAttr &AL) {
   StringRef KindStr;
@@ -9314,6 +9442,10 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
     handleAvailableOnlyInDefaultEvalMethod(S, D, AL);
     break;
 
+  case ParsedAttr::AT_CountedBy:
+    handleCountedByAttr(S, D, AL);
+    break;
+
   // Microsoft attributes:
   case ParsedAttr::AT_LayoutVersion:
     handleLayoutVersion(S, D, AL);
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 302e944d5d74f1..f9c010b1a00248 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -49,6 +49,7 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include <map>
 #include <optional>
@@ -7692,7 +7693,7 @@ bool Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD,
   unsigned ExpectedParams = 1;
   if (CSM == CXXDefaultConstructor || CSM == CXXDestructor)
     ExpectedParams = 0;
-  if (MD->getNumParams() != ExpectedParams) {
+  if (MD->getNumExplicitParams() != ExpectedParams) {
     // This checks for default arguments: a copy or move constructor with a
     // default argument is classified as a default constructor, and assignment
     // operations and destructors can't have default arguments.
@@ -7721,10 +7722,12 @@ bool Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD,
   if (CSM == CXXCopyAssignment || CSM == CXXMoveAssignment) {
     // Check for return type matching.
     ReturnType = Type->getReturnType();
+    QualType ThisType = MD->getFunctionObjectParameterType();
 
     QualType DeclType = Context.getTypeDeclType(RD);
     DeclType = Context.getElaboratedType(ETK_None, nullptr, DeclType, nullptr);
-    DeclType = Context.getAddrSpaceQualType(DeclType, MD->getMethodQualifiers().getAddressSpace());
+    DeclType = Context.getAddrSpaceQualType(
+        DeclType, ThisType.getQualifiers().getAddressSpace());
     QualType ExpectedReturnType = Context.getLValueReferenceType(DeclType);
 
     if (!Context.hasSameType(ReturnType, ExpectedReturnType)) {
@@ -7734,7 +7737,7 @@ bool Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD,
     }
 
     // A defaulted special member cannot have cv-qualifiers.
-    if (Type->getMethodQuals().hasConst() || Type->getMethodQuals().hasVolatile()) {
+    if (ThisType.isConstQualified() || ThisType.isVolatileQualified()) {
       if (DeleteOnTypeMismatch)
         ShouldDeleteForTypeMismatch = true;
       else {
@@ -7746,7 +7749,10 @@ bool Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD,
   }
 
   // Check for parameter type matching.
-  QualType ArgType = ExpectedParams ? Type->getParamType(0) : QualType();
+  QualType ArgType =
+      ExpectedParams
+          ? Type->getParamType(MD->isExplicitObjectMemberFunction() ? 1 : 0)
+          : QualType();
   bool HasConstParam = false;
   if (ExpectedParams && ArgType->isReferenceType()) {
     // Argument must be reference to possibly-const T.
@@ -7814,10 +7820,17 @@ bool Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD,
                                   : isa<CXXConstructorDecl>(MD))) &&
       MD->isConstexpr() && !Constexpr &&
       MD->getTemplatedKind() == FunctionDecl::TK_NonTemplate) {
-    Diag(MD->getBeginLoc(), MD->isConsteval()
-                                ? diag::err_incorrect_defaulted_consteval
-                                : diag::err_incorrect_defaulted_constexpr)
-        << CSM;
+        if (!MD->isConsteval() && RD->getNumVBases()) {
+          Diag(MD->getBeginLoc(), diag::err_incorrect_defaulted_constexpr_with_vb)
+              << CSM;
+          for (const auto &I : RD->vbases())
+            Diag(I.getBeginLoc(), diag::note_constexpr_virtual_base_here);
+        } else {
+          Diag(MD->getBeginLoc(), MD->isConsteval()
+                                      ? diag::err_incorrect_defaulted_consteval
+                                      : diag::err_incorrect_defaulted_constexpr)
+              << CSM;
+        }
     // FIXME: Explain why the special member can't be constexpr.
     HadError = true;
   }
@@ -7840,8 +7853,8 @@ bool Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD,
       FunctionProtoType::ExtProtoInfo EPI = Type->getExtProtoInfo();
       EPI.ExceptionSpec.Type = EST_Unevaluated;
       EPI.ExceptionSpec.SourceDecl = MD;
-      MD->setType(Context.getFunctionType(
-          ReturnType, llvm::ArrayRef(&ArgType, ExpectedParams), EPI));
+      MD->setType(
+          Context.getFunctionType(ReturnType, Type->getParamTypes(), EPI));
     }
   }
 
@@ -8492,7 +8505,8 @@ class DefaultedComparisonSynthesizer
   ExprPair getCompleteObject() {
     unsigned Param = 0;
     ExprResult LHS;
-    if (isa<CXXMethodDecl>(FD)) {
+    if (const auto *MD = dyn_cast<CXXMethodDecl>(FD);
+        MD && MD->isImplicitObjectMemberFunction()) {
       // LHS is '*this'.
       LHS = S.ActOnCXXThis(Loc);
       if (!LHS.isInvalid())
@@ -8798,15 +8812,22 @@ bool Sema::CheckExplicitlyDefaultedComparison(Scope *S, FunctionDecl *FD,
     // If we're out-of-class, this is the class we're comparing.
     if (!RD)
       RD = MD->getParent();
-
-    if (!MD->isConst()) {
-      SourceLocation InsertLoc;
-      if (FunctionTypeLoc Loc = MD->getFunctionTypeLoc())
-        InsertLoc = getLocForEndOfToken(Loc.getRParenLoc());
+    QualType T = MD->getFunctionObjectParameterType();
+    if (!T.isConstQualified()) {
+      SourceLocation Loc, InsertLoc;
+      if (MD->isExplicitObjectMemberFunction()) {
+        Loc = MD->getParamDecl(0)->getBeginLoc();
+        InsertLoc = getLocForEndOfToken(
+            MD->getParamDecl(0)->getExplicitObjectParamThisLoc());
+      } else {
+        Loc = MD->getLocation();
+        if (FunctionTypeLoc Loc = MD->getFunctionTypeLoc())
+          InsertLoc = Loc.getRParenLoc();
+      }
       // Don't diagnose an implicit 'operator=='; we will have diagnosed the
       // corresponding defaulted 'operator<=>' already.
       if (!MD->isImplicit()) {
-        Diag(MD->getLocation(), diag::err_defaulted_comparison_non_const)
+        Diag(Loc, diag::err_defaulted_comparison_non_const)
             << (int)DCK << FixItHint::CreateInsertion(InsertLoc, " const");
       }
 
@@ -8830,7 +8851,9 @@ bool Sema::CheckExplicitlyDefaultedComparison(Scope *S, FunctionDecl *FD,
     }
   }
 
-  if (FD->getNumParams() != (IsMethod ? 1 : 2)) {
+  if ((FD->getNumParams() -
+       (unsigned)FD->hasCXXExplicitFunctionObjectParameter()) !=
+      (IsMethod ? 1 : 2)) {
     // Let's not worry about using a variadic template pack here -- who would do
     // such a thing?
     Diag(FD->getLocation(), diag::err_defaulted_comparison_num_args)
@@ -8840,6 +8863,8 @@ bool Sema::CheckExplicitlyDefaultedComparison(Scope *S, FunctionDecl *FD,
 
   const ParmVarDecl *KnownParm = nullptr;
   for (const ParmVarDecl *Param : FD->parameters()) {
+    if (Param->isExplicitObjectParameter())
+      continue;
     QualType ParmTy = Param->getType();
 
     if (!KnownParm) {
@@ -9223,9 +9248,9 @@ struct SpecialMemberVisitor {
       llvm_unreachable("invalid special member kind");
     }
 
-    if (MD->getNumParams()) {
+    if (MD->getNumExplicitParams()) {
       if (const ReferenceType *RT =
-              MD->getParamDecl(0)->getType()->getAs<ReferenceType>())
+              MD->getNonObjectParameter(0)->getType()->getAs<ReferenceType>())
         ConstArg = RT->getPointeeType().isConstQualified();
     }
   }
@@ -10105,7 +10130,7 @@ bool Sema::SpecialMemberIsTrivial(CXXMethodDecl *MD, CXXSpecialMember CSM,
 
   case CXXCopyConstructor:
   case CXXCopyAssignment: {
-    const ParmVarDecl *Param0 = MD->getParamDecl(0);
+    const ParmVarDecl *Param0 = MD->getNonObjectParameter(0);
     const ReferenceType *RT = Param0->getType()->getAs<ReferenceType>();
 
     // When ClangABICompat14 is true, CXX copy constructors will only be trivial
@@ -10136,7 +10161,7 @@ bool Sema::SpecialMemberIsTrivial(CXXMethodDecl *MD, CXXSpecialMember CSM,
   case CXXMoveConstructor:
   case CXXMoveAssignment: {
     // Trivial move operations always have non-cv-qualified parameters.
-    const ParmVarDecl *Param0 = MD->getParamDecl(0);
+    const ParmVarDecl *Param0 = MD->getNonObjectParameter(0);
     const RValueReferenceType *RT =
       Param0->getType()->getAs<RValueReferenceType>();
     if (!RT || RT->getPointeeType().getCVRQualifiers()) {
@@ -11102,15 +11127,25 @@ void Sema::CheckConversionDeclarator(Declarator &D, QualType &R,
         << SourceRange(D.getIdentifierLoc()) << 0;
     D.setInvalidType();
   }
-
   const auto *Proto = R->castAs<FunctionProtoType>();
-
   // Make sure we don't have any parameters.
-  if (Proto->getNumParams() > 0) {
-    Diag(D.getIdentifierLoc(), diag::err_conv_function_with_params);
+  DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
+  unsigned NumParam = Proto->getNumParams();
+
+  // [C++2b]
+  // A conversion function shall have no non-object parameters.
+  if (NumParam == 1) {
+    DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
+    if (const auto *First =
+            dyn_cast_if_present<ParmVarDecl>(FTI.Params[0].Param);
+        First && First->isExplicitObjectParameter())
+      NumParam--;
+  }
 
+  if (NumParam != 0) {
+    Diag(D.getIdentifierLoc(), diag::err_conv_function_with_params);
     // Delete the parameters.
-    D.getFunctionTypeInfo().freeParams();
+    FTI.freeParams();
     D.setInvalidType();
   } else if (Proto->isVariadic()) {
     Diag(D.getIdentifierLoc(), diag::err_conv_function_variadic);
@@ -11268,6 +11303,87 @@ Decl *Sema::ActOnConversionDeclarator(CXXConversionDecl *Conversion) {
   return Conversion;
 }
 
+void Sema::CheckExplicitObjectMemberFunction(DeclContext *DC, Declarator &D,
+                                             DeclarationName Name, QualType R) {
+  CheckExplicitObjectMemberFunction(D, Name, R, false, DC);
+}
+
+void Sema::CheckExplicitObjectLambda(Declarator &D) {
+  CheckExplicitObjectMemberFunction(D, {}, {}, true);
+}
+
+void Sema::CheckExplicitObjectMemberFunction(Declarator &D,
+                                             DeclarationName Name, QualType R,
+                                             bool IsLambda, DeclContext *DC) {
+  if (!D.isFunctionDeclarator())
+    return;
+
+  DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
+  if (FTI.NumParams == 0)
+    return;
+  ParmVarDecl *ExplicitObjectParam = nullptr;
+  for (unsigned Idx = 0; Idx < FTI.NumParams; Idx++) {
+    const auto &ParamInfo = FTI.Params[Idx];
+    if (!ParamInfo.Param)
+      continue;
+    ParmVarDecl *Param = cast<ParmVarDecl>(ParamInfo.Param);
+    if (!Param->isExplicitObjectParameter())
+      continue;
+    if (Idx == 0) {
+      ExplicitObjectParam = Param;
+      continue;
+    } else {
+      Diag(Param->getLocation(),
+           diag::err_explicit_object_parameter_must_be_first)
+          << IsLambda << Param->getSourceRange();
+    }
+  }
+  if (!ExplicitObjectParam)
+    return;
+
+  if (ExplicitObjectParam->hasDefaultArg()) {
+    Diag(ExplicitObjectParam->getLocation(),
+         diag::err_explicit_object_default_arg)
+        << ExplicitObjectParam->getSourceRange();
+  }
+
+  if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static) {
+    Diag(ExplicitObjectParam->getBeginLoc(),
+         diag::err_explicit_object_parameter_nonmember)
+        << D.getSourceRange() << /*static=*/0 << IsLambda;
+  }
+
+  if (D.getDeclSpec().isVirtualSpecified()) {
+    Diag(ExplicitObjectParam->getBeginLoc(),
+         diag::err_explicit_object_parameter_nonmember)
+        << D.getSourceRange() << /*virtual=*/1 << IsLambda;
+  }
+
+  if (IsLambda && FTI.hasMutableQualifier()) {
+    Diag(ExplicitObjectParam->getBeginLoc(),
+         diag::err_explicit_object_parameter_mutable)
+        << D.getSourceRange();
+  }
+
+  if (IsLambda)
+    return;
+
+  if (!DC || !DC->isRecord()) {
+    Diag(ExplicitObjectParam->getLocation(),
+         diag::err_explicit_object_parameter_nonmember)
+        << D.getSourceRange() << /*non-member=*/2 << IsLambda;
+    return;
+  }
+
+  // CWG2674: constructors and destructors cannot have explicit parameters.
+  if (Name.getNameKind() == DeclarationName::CXXConstructorName ||
+      Name.getNameKind() == DeclarationName::CXXDestructorName)
+    Diag(ExplicitObjectParam->getBeginLoc(),
+         diag::err_explicit_object_parameter_constructor)
+        << (Name.getNameKind() == DeclarationName::CXXDestructorName)
+        << D.getSourceRange();
+}
+
 namespace {
 /// Utility class to accumulate and print a diagnostic listing the invalid
 /// specifier(s) on a declaration.
@@ -14862,12 +14978,11 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
   SmallVector<Stmt*, 8> Statements;
 
   // The parameter for the "other" object, which we are copying from.
-  ParmVarDecl *Other = CopyAssignOperator->getParamDecl(0);
+  ParmVarDecl *Other = CopyAssignOperator->getNonObjectParameter(0);
   Qualifiers OtherQuals = Other->getType().getQualifiers();
   QualType OtherRefType = Other->getType();
-  if (const LValueReferenceType *OtherRef
-                                = OtherRefType->getAs<LValueReferenceType>()) {
-    OtherRefType = OtherRef->getPointeeType();
+  if (OtherRefType->isLValueReferenceType()) {
+    OtherRefType = OtherRefType->getPointeeType();
     OtherQuals = OtherRefType.getQualifiers();
   }
 
@@ -14879,8 +14994,26 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
   // Builds a DeclRefExpr for the "other" object.
   RefBuilder OtherRef(Other, OtherRefType);
 
-  // Builds the "this" pointer.
-  ThisBuilder This;
+  // Builds the function object parameter.
+  std::optional<ThisBuilder> This;
+  std::optional<DerefBuilder> DerefThis;
+  std::optional<RefBuilder> ExplicitObject;
+  bool IsArrow = false;
+  QualType ObjectType;
+  if (CopyAssignOperator->isExplicitObjectMemberFunction()) {
+    ObjectType = CopyAssignOperator->getParamDecl(0)->getType();
+    if (ObjectType->isReferenceType())
+      ObjectType = ObjectType->getPointeeType();
+    ExplicitObject.emplace(CopyAssignOperator->getParamDecl(0), ObjectType);
+  } else {
+    ObjectType = getCurrentThisType();
+    This.emplace();
+    DerefThis.emplace(*This);
+    IsArrow = !LangOpts.HLSL;
+  }
+  ExprBuilder &ObjectParameter =
+      ExplicitObject ? static_cast<ExprBuilder &>(*ExplicitObject)
+                     : static_cast<ExprBuilder &>(*This);
 
   // Assign base classes.
   bool Invalid = false;
@@ -14902,11 +15035,11 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
                      VK_LValue, BasePath);
 
     // Dereference "this".
-    DerefBuilder DerefThis(This);
-    CastBuilder To(DerefThis,
-                   Context.getQualifiedType(
-                       BaseType, CopyAssignOperator->getMethodQualifiers()),
-                   VK_LValue, BasePath);
+    CastBuilder To(
+        ExplicitObject ? static_cast<ExprBuilder &>(*ExplicitObject)
+                       : static_cast<ExprBuilder &>(*DerefThis),
+        Context.getQualifiedType(BaseType, ObjectType.getQualifiers()),
+        VK_LValue, BasePath);
 
     // Build the copy.
     StmtResult Copy = buildSingleCopyAssign(*this, Loc, BaseType,
@@ -14972,10 +15105,7 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
     MemberLookup.resolveKind();
 
     MemberBuilder From(OtherRef, OtherRefType, /*IsArrow=*/false, MemberLookup);
-
-    MemberBuilder To(This, getCurrentThisType(), /*IsArrow=*/!LangOpts.HLSL,
-                     MemberLookup);
-
+    MemberBuilder To(ObjectParameter, ObjectType, IsArrow, MemberLookup);
     // Build the copy of this field.
     StmtResult Copy = buildSingleCopyAssign(*this, Loc, FieldType,
                                             To, From,
@@ -14992,15 +15122,11 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
 
   if (!Invalid) {
     // Add a "return *this;"
-    Expr *ThisExpr = nullptr;
-    if (!LangOpts.HLSL) {
-      ExprResult ThisObj =
-          CreateBuiltinUnaryOp(Loc, UO_Deref, This.build(*this, Loc));
-      ThisExpr = ThisObj.get();
-    } else {
-      ThisExpr = This.build(*this, Loc);
-    }
-
+    Expr *ThisExpr =
+        (ExplicitObject  ? static_cast<ExprBuilder &>(*ExplicitObject)
+         : LangOpts.HLSL ? static_cast<ExprBuilder &>(*This)
+                         : static_cast<ExprBuilder &>(*DerefThis))
+            .build(*this, Loc);
     StmtResult Return = BuildReturnStmt(Loc, ThisExpr);
     if (Return.isInvalid())
       Invalid = true;
@@ -15231,7 +15357,7 @@ void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation,
   SmallVector<Stmt*, 8> Statements;
 
   // The parameter for the "other" object, which we are move from.
-  ParmVarDecl *Other = MoveAssignOperator->getParamDecl(0);
+  ParmVarDecl *Other = MoveAssignOperator->getNonObjectParameter(0);
   QualType OtherRefType =
       Other->getType()->castAs<RValueReferenceType>()->getPointeeType();
 
@@ -15245,8 +15371,23 @@ void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation,
   // Cast to rvalue.
   MoveCastBuilder MoveOther(OtherRef);
 
-  // Builds the "this" pointer.
-  ThisBuilder This;
+  // Builds the function object parameter.
+  std::optional<ThisBuilder> This;
+  std::optional<DerefBuilder> DerefThis;
+  std::optional<RefBuilder> ExplicitObject;
+  QualType ObjectType;
+  if (MoveAssignOperator->isExplicitObjectMemberFunction()) {
+    ObjectType = MoveAssignOperator->getParamDecl(0)->getType();
+    if (ObjectType->isReferenceType())
+      ObjectType = ObjectType->getPointeeType();
+    ExplicitObject.emplace(MoveAssignOperator->getParamDecl(0), ObjectType);
+  } else {
+    ObjectType = getCurrentThisType();
+    This.emplace();
+    DerefThis.emplace(*This);
+  }
+  ExprBuilder &ObjectParameter =
+      ExplicitObject ? *ExplicitObject : static_cast<ExprBuilder &>(*This);
 
   // Assign base classes.
   bool Invalid = false;
@@ -15274,14 +15415,13 @@ void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation,
     // appropriately-qualified base type.
     CastBuilder From(OtherRef, BaseType, VK_XValue, BasePath);
 
-    // Dereference "this".
-    DerefBuilder DerefThis(This);
-
     // Implicitly cast "this" to the appropriately-qualified base type.
-    CastBuilder To(DerefThis,
-                   Context.getQualifiedType(
-                       BaseType, MoveAssignOperator->getMethodQualifiers()),
-                   VK_LValue, BasePath);
+    // Dereference "this".
+    CastBuilder To(
+        ExplicitObject ? static_cast<ExprBuilder &>(*ExplicitObject)
+                       : static_cast<ExprBuilder &>(*DerefThis),
+        Context.getQualifiedType(BaseType, ObjectType.getQualifiers()),
+        VK_LValue, BasePath);
 
     // Build the move.
     StmtResult Move = buildSingleCopyAssign(*this, Loc, BaseType,
@@ -15346,8 +15486,8 @@ void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation,
     MemberLookup.resolveKind();
     MemberBuilder From(MoveOther, OtherRefType,
                        /*IsArrow=*/false, MemberLookup);
-    MemberBuilder To(This, getCurrentThisType(),
-                     /*IsArrow=*/true, MemberLookup);
+    MemberBuilder To(ObjectParameter, ObjectType, /*IsArrow=*/!ExplicitObject,
+                     MemberLookup);
 
     assert(!From.build(*this, Loc)->isLValue() && // could be xvalue or prvalue
         "Member reference with rvalue base must be rvalue except for reference "
@@ -15369,10 +15509,12 @@ void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation,
 
   if (!Invalid) {
     // Add a "return *this;"
-    ExprResult ThisObj =
-        CreateBuiltinUnaryOp(Loc, UO_Deref, This.build(*this, Loc));
+    Expr *ThisExpr =
+        (ExplicitObject ? static_cast<ExprBuilder &>(*ExplicitObject)
+                        : static_cast<ExprBuilder &>(*DerefThis))
+            .build(*this, Loc);
 
-    StmtResult Return = BuildReturnStmt(Loc, ThisObj.get());
+    StmtResult Return = BuildReturnStmt(Loc, ThisExpr);
     if (Return.isInvalid())
       Invalid = true;
     else
@@ -15691,7 +15833,9 @@ void Sema::DefineImplicitLambdaToFunctionPointerConversion(
   CXXRecordDecl *Lambda = Conv->getParent();
   FunctionDecl *CallOp = Lambda->getLambdaCallOperator();
   FunctionDecl *Invoker =
-      CallOp->isStatic() ? CallOp : Lambda->getLambdaStaticInvoker(CC);
+      CallOp->hasCXXExplicitFunctionObjectParameter() || CallOp->isStatic()
+          ? CallOp
+          : Lambda->getLambdaStaticInvoker(CC);
 
   if (auto *TemplateArgs = Conv->getTemplateSpecializationArgs()) {
     CallOp = InstantiateFunctionDeclaration(
@@ -16297,8 +16441,11 @@ bool Sema::CheckOverloadedOperatorDeclaration(FunctionDecl *FnDecl) {
   //   [...] Operator functions cannot have more or fewer parameters
   //   than the number required for the corresponding operator, as
   //   described in the rest of this subclause.
-  unsigned NumParams = FnDecl->getNumParams()
-                     + (isa<CXXMethodDecl>(FnDecl)? 1 : 0);
+  unsigned NumParams = FnDecl->getNumParams() +
+                       (isa<CXXMethodDecl>(FnDecl) &&
+                                !FnDecl->hasCXXExplicitFunctionObjectParameter()
+                            ? 1
+                            : 0);
   if (Op != OO_Call && Op != OO_Subscript &&
       ((NumParams == 1 && !CanBeUnaryOperator) ||
        (NumParams == 2 && !CanBeBinaryOperator) || (NumParams < 1) ||
@@ -16880,10 +17027,74 @@ Decl *Sema::ActOnStaticAssertDeclaration(SourceLocation StaticAssertLoc,
                                       AssertMessageExpr, RParenLoc, false);
 }
 
+static void WriteCharTypePrefix(BuiltinType::Kind BTK, llvm::raw_ostream &OS) {
+  switch (BTK) {
+  case BuiltinType::Char_S:
+  case BuiltinType::Char_U:
+    break;
+  case BuiltinType::Char8:
+    OS << "u8";
+    break;
+  case BuiltinType::Char16:
+    OS << 'u';
+    break;
+  case BuiltinType::Char32:
+    OS << 'U';
+    break;
+  case BuiltinType::WChar_S:
+  case BuiltinType::WChar_U:
+    OS << 'L';
+    break;
+  default:
+    llvm_unreachable("Non-character type");
+  }
+}
+
+/// Convert character's value, interpreted as a code unit, to a string.
+/// The value needs to be zero-extended to 32-bits.
+/// FIXME: This assumes Unicode literal encodings
+static void WriteCharValueForDiagnostic(uint32_t Value, const BuiltinType *BTy,
+                                        unsigned TyWidth,
+                                        SmallVectorImpl<char> &Str) {
+  char Arr[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+  char *Ptr = Arr;
+  BuiltinType::Kind K = BTy->getKind();
+  llvm::raw_svector_ostream OS(Str);
+
+  // This should catch Char_S, Char_U, Char8, and use of escaped characters in
+  // other types.
+  if (K == BuiltinType::Char_S || K == BuiltinType::Char_U ||
+      K == BuiltinType::Char8 || Value <= 0x7F) {
+    StringRef Escaped = escapeCStyle<EscapeChar::Single>(Value);
+    if (!Escaped.empty())
+      EscapeStringForDiagnostic(Escaped, Str);
+    else
+      OS << static_cast<char>(Value);
+    return;
+  }
+
+  switch (K) {
+  case BuiltinType::Char16:
+  case BuiltinType::Char32:
+  case BuiltinType::WChar_S:
+  case BuiltinType::WChar_U: {
+    if (llvm::ConvertCodePointToUTF8(Value, Ptr))
+      EscapeStringForDiagnostic(StringRef(Arr, Ptr - Arr), Str);
+    else
+      OS << "\\x"
+         << llvm::format_hex_no_prefix(Value, TyWidth / 4, /*Upper=*/true);
+    break;
+  }
+  default:
+    llvm_unreachable("Non-character type is passed");
+  }
+}
+
 /// Convert \V to a string we can present to the user in a diagnostic
 /// \T is the type of the expression that has been evaluated into \V
 static bool ConvertAPValueToString(const APValue &V, QualType T,
-                                   SmallVectorImpl<char> &Str) {
+                                   SmallVectorImpl<char> &Str,
+                                   ASTContext &Context) {
   if (!V.hasValue())
     return false;
 
@@ -16898,13 +17109,38 @@ static bool ConvertAPValueToString(const APValue &V, QualType T,
              "Bool type, but value is not 0 or 1");
       llvm::raw_svector_ostream OS(Str);
       OS << (BoolValue ? "true" : "false");
-    } else if (T->isCharType()) {
+    } else {
+      llvm::raw_svector_ostream OS(Str);
       // Same is true for chars.
-      Str.push_back('\'');
-      Str.push_back(V.getInt().getExtValue());
-      Str.push_back('\'');
-    } else
+      // We want to print the character representation for textual types
+      const auto *BTy = T->getAs<BuiltinType>();
+      if (BTy) {
+        switch (BTy->getKind()) {
+        case BuiltinType::Char_S:
+        case BuiltinType::Char_U:
+        case BuiltinType::Char8:
+        case BuiltinType::Char16:
+        case BuiltinType::Char32:
+        case BuiltinType::WChar_S:
+        case BuiltinType::WChar_U: {
+          unsigned TyWidth = Context.getIntWidth(T);
+          assert(8 <= TyWidth && TyWidth <= 32 && "Unexpected integer width");
+          uint32_t CodeUnit = static_cast<uint32_t>(V.getInt().getZExtValue());
+          WriteCharTypePrefix(BTy->getKind(), OS);
+          OS << '\'';
+          WriteCharValueForDiagnostic(CodeUnit, BTy, TyWidth, Str);
+          OS << "' (0x"
+             << llvm::format_hex_no_prefix(CodeUnit, /*Width=*/2,
+                                           /*Upper=*/true)
+             << ", " << V.getInt() << ')';
+          return true;
+        }
+        default:
+          break;
+        }
+      }
       V.getInt().toString(Str);
+    }
 
     break;
 
@@ -17001,8 +17237,9 @@ void Sema::DiagnoseStaticAssertDetails(const Expr *E) {
 
       Side->EvaluateAsRValue(DiagSide[I].Result, Context, true);
 
-      DiagSide[I].Print = ConvertAPValueToString(
-          DiagSide[I].Result.Val, Side->getType(), DiagSide[I].ValueString);
+      DiagSide[I].Print =
+          ConvertAPValueToString(DiagSide[I].Result.Val, Side->getType(),
+                                 DiagSide[I].ValueString, Context);
     }
     if (DiagSide[0].Print && DiagSide[1].Print) {
       Diag(Op->getExprLoc(), diag::note_expr_evaluates_to)
@@ -18106,6 +18343,20 @@ bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New,
   return true;
 }
 
+bool Sema::CheckExplicitObjectOverride(CXXMethodDecl *New,
+                                       const CXXMethodDecl *Old) {
+  // CWG2553
+  // A virtual function shall not be an explicit object member function.
+  if (!New->isExplicitObjectMemberFunction())
+    return true;
+  Diag(New->getParamDecl(0)->getBeginLoc(),
+       diag::err_explicit_object_parameter_nonmember)
+      << New->getSourceRange() << /*virtual*/ 1 << /*IsLambda*/ false;
+  Diag(Old->getLocation(), diag::note_overridden_virtual_function);
+  New->setInvalidDecl();
+  return false;
+}
+
 bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New,
                                              const CXXMethodDecl *Old) {
   QualType NewTy = New->getType()->castAs<FunctionType>()->getReturnType();
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 72304e51ea8ef3..75730ea888afb4 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -769,14 +769,12 @@ bool Sema::handlerCanCatch(QualType HandlerType, QualType ExceptionType) {
 /// CheckExceptionSpecSubset - Check whether the second function type's
 /// exception specification is a subset (or equivalent) of the first function
 /// type. This is used by override and pointer assignment checks.
-bool Sema::CheckExceptionSpecSubset(const PartialDiagnostic &DiagID,
-                                    const PartialDiagnostic &NestedDiagID,
-                                    const PartialDiagnostic &NoteID,
-                                    const PartialDiagnostic &NoThrowDiagID,
-                                    const FunctionProtoType *Superset,
-                                    SourceLocation SuperLoc,
-                                    const FunctionProtoType *Subset,
-                                    SourceLocation SubLoc) {
+bool Sema::CheckExceptionSpecSubset(
+    const PartialDiagnostic &DiagID, const PartialDiagnostic &NestedDiagID,
+    const PartialDiagnostic &NoteID, const PartialDiagnostic &NoThrowDiagID,
+    const FunctionProtoType *Superset, bool SkipSupersetFirstParameter,
+    SourceLocation SuperLoc, const FunctionProtoType *Subset,
+    bool SkipSubsetFirstParameter, SourceLocation SubLoc) {
 
   // Just auto-succeed under -fno-exceptions.
   if (!getLangOpts().CXXExceptions)
@@ -816,8 +814,9 @@ bool Sema::CheckExceptionSpecSubset(const PartialDiagnostic &DiagID,
   // done.
   if ((SuperCanThrow == CT_Can && SuperEST != EST_Dynamic) ||
       SubCanThrow == CT_Cannot)
-    return CheckParamExceptionSpec(NestedDiagID, NoteID, Superset, SuperLoc,
-                                   Subset, SubLoc);
+    return CheckParamExceptionSpec(NestedDiagID, NoteID, Superset,
+                                   SkipSupersetFirstParameter, SuperLoc, Subset,
+                                   SkipSubsetFirstParameter, SubLoc);
 
   // Allow __declspec(nothrow) to be missing on redeclaration as an extension in
   // some cases.
@@ -869,8 +868,9 @@ bool Sema::CheckExceptionSpecSubset(const PartialDiagnostic &DiagID,
     }
   }
   // We've run half the gauntlet.
-  return CheckParamExceptionSpec(NestedDiagID, NoteID, Superset, SuperLoc,
-                                 Subset, SubLoc);
+  return CheckParamExceptionSpec(NestedDiagID, NoteID, Superset,
+                                 SkipSupersetFirstParameter, SuperLoc, Subset,
+                                 SkipSupersetFirstParameter, SubLoc);
 }
 
 static bool
@@ -894,12 +894,11 @@ CheckSpecForTypesEquivalent(Sema &S, const PartialDiagnostic &DiagID,
 /// assignment and override compatibility check. We do not check the parameters
 /// of parameter function pointers recursively, as no sane programmer would
 /// even be able to write such a function type.
-bool Sema::CheckParamExceptionSpec(const PartialDiagnostic &DiagID,
-                                   const PartialDiagnostic &NoteID,
-                                   const FunctionProtoType *Target,
-                                   SourceLocation TargetLoc,
-                                   const FunctionProtoType *Source,
-                                   SourceLocation SourceLoc) {
+bool Sema::CheckParamExceptionSpec(
+    const PartialDiagnostic &DiagID, const PartialDiagnostic &NoteID,
+    const FunctionProtoType *Target, bool SkipTargetFirstParameter,
+    SourceLocation TargetLoc, const FunctionProtoType *Source,
+    bool SkipSourceFirstParameter, SourceLocation SourceLoc) {
   auto RetDiag = DiagID;
   RetDiag << 0;
   if (CheckSpecForTypesEquivalent(
@@ -910,14 +909,16 @@ bool Sema::CheckParamExceptionSpec(const PartialDiagnostic &DiagID,
 
   // We shouldn't even be testing this unless the arguments are otherwise
   // compatible.
-  assert(Target->getNumParams() == Source->getNumParams() &&
+  assert((Target->getNumParams() - (unsigned)SkipTargetFirstParameter) ==
+             (Source->getNumParams() - (unsigned)SkipSourceFirstParameter) &&
          "Functions have different argument counts.");
   for (unsigned i = 0, E = Target->getNumParams(); i != E; ++i) {
     auto ParamDiag = DiagID;
     ParamDiag << 1;
     if (CheckSpecForTypesEquivalent(
             *this, ParamDiag, PDiag(),
-            Target->getParamType(i), TargetLoc, Source->getParamType(i),
+            Target->getParamType(i + (SkipTargetFirstParameter ? 1 : 0)),
+            TargetLoc, Source->getParamType(SkipSourceFirstParameter ? 1 : 0),
             SourceLoc))
       return true;
   }
@@ -958,9 +959,10 @@ bool Sema::CheckExceptionSpecCompatibility(Expr *From, QualType ToType) {
   //     void (*q)(void (*) throw(int)) = p;
   //   }
   // ... because it might be instantiated with T=int.
-  return CheckExceptionSpecSubset(
-             PDiag(DiagID), PDiag(NestedDiagID), PDiag(), PDiag(), ToFunc,
-             From->getSourceRange().getBegin(), FromFunc, SourceLocation()) &&
+  return CheckExceptionSpecSubset(PDiag(DiagID), PDiag(NestedDiagID), PDiag(),
+                                  PDiag(), ToFunc, 0,
+                                  From->getSourceRange().getBegin(), FromFunc,
+                                  0, SourceLocation()) &&
          !getLangOpts().CPlusPlus17;
 }
 
@@ -989,14 +991,14 @@ bool Sema::CheckOverridingFunctionExceptionSpec(const CXXMethodDecl *New,
   unsigned DiagID = diag::err_override_exception_spec;
   if (getLangOpts().MSVCCompat)
     DiagID = diag::ext_override_exception_spec;
-  return CheckExceptionSpecSubset(PDiag(DiagID),
-                                  PDiag(diag::err_deep_exception_specs_differ),
-                                  PDiag(diag::note_overridden_virtual_function),
-                                  PDiag(diag::ext_override_exception_spec),
-                                  Old->getType()->castAs<FunctionProtoType>(),
-                                  Old->getLocation(),
-                                  New->getType()->castAs<FunctionProtoType>(),
-                                  New->getLocation());
+  return CheckExceptionSpecSubset(
+      PDiag(DiagID), PDiag(diag::err_deep_exception_specs_differ),
+      PDiag(diag::note_overridden_virtual_function),
+      PDiag(diag::ext_override_exception_spec),
+      Old->getType()->castAs<FunctionProtoType>(),
+      Old->hasCXXExplicitFunctionObjectParameter(), Old->getLocation(),
+      New->getType()->castAs<FunctionProtoType>(),
+      New->hasCXXExplicitFunctionObjectParameter(), New->getLocation());
 }
 
 static CanThrowResult canSubStmtsThrow(Sema &Self, const Stmt *S) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 92496b03ecabe5..9c5f96eebd0416 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2444,7 +2444,7 @@ bool Sema::DiagnoseDependentMemberLookup(const LookupResult &R) {
   // FIXME: Is this special case necessary? We could allow the caller to
   // diagnose this.
   if (isDefaultArgument && ((*R.begin())->isCXXInstanceMember())) {
-    Diag(R.getNameLoc(), diag::err_member_call_without_object);
+    Diag(R.getNameLoc(), diag::err_member_call_without_object) << 0;
     return true;
   }
 
@@ -3217,11 +3217,11 @@ Sema::PerformObjectMemberConversion(Expr *From,
       FromRecordType = FromType;
     }
   } else if (const auto *Method = dyn_cast<CXXMethodDecl>(Member)) {
-    if (Method->isStatic())
+    if (!Method->isImplicitObjectMemberFunction())
       return From;
 
     DestType = Method->getThisType().getNonReferenceType();
-    DestRecordType = Method->getThisObjectType();
+    DestRecordType = Method->getFunctionObjectParameterType();
 
     if (FromType->getAs<PointerType>()) {
       FromRecordType = FromType->getPointeeType();
@@ -6503,6 +6503,9 @@ Sema::ConvertArgumentsForCall(CallExpr *Call, Expr *Fn,
 
   // C99 6.5.2.2p7 - the arguments are implicitly converted, as if by
   // assignment, to the types of the corresponding parameter, ...
+  bool HasExplicitObjectParameter =
+      FDecl && FDecl->hasCXXExplicitFunctionObjectParameter();
+  unsigned ExplicitObjectParameterOffset = HasExplicitObjectParameter ? 1 : 0;
   unsigned NumParams = Proto->getNumParams();
   bool Invalid = false;
   unsigned MinArgs = FDecl ? FDecl->getMinRequiredArguments() : NumParams;
@@ -6521,21 +6524,29 @@ Sema::ConvertArgumentsForCall(CallExpr *Call, Expr *Fn,
             MinArgs == NumParams && !Proto->isVariadic()
                 ? diag::err_typecheck_call_too_few_args_suggest
                 : diag::err_typecheck_call_too_few_args_at_least_suggest;
-        diagnoseTypo(TC, PDiag(diag_id) << FnKind << MinArgs
-                                        << static_cast<unsigned>(Args.size())
-                                        << TC.getCorrectionRange());
-      } else if (MinArgs == 1 && FDecl && FDecl->getParamDecl(0)->getDeclName())
+        diagnoseTypo(
+            TC, PDiag(diag_id)
+                    << FnKind << MinArgs - ExplicitObjectParameterOffset
+                    << static_cast<unsigned>(Args.size()) -
+                           ExplicitObjectParameterOffset
+                    << HasExplicitObjectParameter << TC.getCorrectionRange());
+      } else if (MinArgs - ExplicitObjectParameterOffset == 1 && FDecl &&
+                 FDecl->getParamDecl(ExplicitObjectParameterOffset)
+                     ->getDeclName())
         Diag(RParenLoc,
              MinArgs == NumParams && !Proto->isVariadic()
                  ? diag::err_typecheck_call_too_few_args_one
                  : diag::err_typecheck_call_too_few_args_at_least_one)
-            << FnKind << FDecl->getParamDecl(0) << Fn->getSourceRange();
+            << FnKind << FDecl->getParamDecl(ExplicitObjectParameterOffset)
+            << HasExplicitObjectParameter << Fn->getSourceRange();
       else
         Diag(RParenLoc, MinArgs == NumParams && !Proto->isVariadic()
                             ? diag::err_typecheck_call_too_few_args
                             : diag::err_typecheck_call_too_few_args_at_least)
-            << FnKind << MinArgs << static_cast<unsigned>(Args.size())
-            << Fn->getSourceRange();
+            << FnKind << MinArgs - ExplicitObjectParameterOffset
+            << static_cast<unsigned>(Args.size()) -
+                   ExplicitObjectParameterOffset
+            << HasExplicitObjectParameter << Fn->getSourceRange();
 
       // Emit the location of the prototype.
       if (!TC && FDecl && !FDecl->getBuiltinID() && !IsExecConfig)
@@ -6560,17 +6571,23 @@ Sema::ConvertArgumentsForCall(CallExpr *Call, Expr *Fn,
             MinArgs == NumParams && !Proto->isVariadic()
                 ? diag::err_typecheck_call_too_many_args_suggest
                 : diag::err_typecheck_call_too_many_args_at_most_suggest;
-        diagnoseTypo(TC, PDiag(diag_id) << FnKind << NumParams
-                                        << static_cast<unsigned>(Args.size())
-                                        << TC.getCorrectionRange());
-      } else if (NumParams == 1 && FDecl &&
-                 FDecl->getParamDecl(0)->getDeclName())
+        diagnoseTypo(
+            TC, PDiag(diag_id)
+                    << FnKind << NumParams - ExplicitObjectParameterOffset
+                    << static_cast<unsigned>(Args.size()) -
+                           ExplicitObjectParameterOffset
+                    << HasExplicitObjectParameter << TC.getCorrectionRange());
+      } else if (NumParams - ExplicitObjectParameterOffset == 1 && FDecl &&
+                 FDecl->getParamDecl(ExplicitObjectParameterOffset)
+                     ->getDeclName())
         Diag(Args[NumParams]->getBeginLoc(),
              MinArgs == NumParams
                  ? diag::err_typecheck_call_too_many_args_one
                  : diag::err_typecheck_call_too_many_args_at_most_one)
-            << FnKind << FDecl->getParamDecl(0)
-            << static_cast<unsigned>(Args.size()) << Fn->getSourceRange()
+            << FnKind << FDecl->getParamDecl(ExplicitObjectParameterOffset)
+            << static_cast<unsigned>(Args.size()) -
+                   ExplicitObjectParameterOffset
+            << HasExplicitObjectParameter << Fn->getSourceRange()
             << SourceRange(Args[NumParams]->getBeginLoc(),
                            Args.back()->getEndLoc());
       else
@@ -6578,8 +6595,10 @@ Sema::ConvertArgumentsForCall(CallExpr *Call, Expr *Fn,
              MinArgs == NumParams
                  ? diag::err_typecheck_call_too_many_args
                  : diag::err_typecheck_call_too_many_args_at_most)
-            << FnKind << NumParams << static_cast<unsigned>(Args.size())
-            << Fn->getSourceRange()
+            << FnKind << NumParams - ExplicitObjectParameterOffset
+            << static_cast<unsigned>(Args.size()) -
+                   ExplicitObjectParameterOffset
+            << HasExplicitObjectParameter << Fn->getSourceRange()
             << SourceRange(Args[NumParams]->getBeginLoc(),
                            Args.back()->getEndLoc());
 
@@ -7661,9 +7680,9 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
   }
 
   if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(FDecl))
-    if (!Method->isStatic())
+    if (Method->isImplicitObjectMemberFunction())
       return ExprError(Diag(LParenLoc, diag::err_member_call_without_object)
-        << Fn->getSourceRange());
+                       << Fn->getSourceRange() << 0);
 
   // Check for sentinels
   if (NDecl)
@@ -14955,6 +14974,34 @@ static void diagnoseAddressOfInvalidType(Sema &S, SourceLocation Loc,
   S.Diag(Loc, diag::err_typecheck_address_of) << Type << E->getSourceRange();
 }
 
+bool Sema::CheckUseOfCXXMethodAsAddressOfOperand(SourceLocation OpLoc,
+                                                 const Expr *Op,
+                                                 const CXXMethodDecl *MD) {
+  const auto *DRE = cast<DeclRefExpr>(Op->IgnoreParens());
+
+  if (Op != DRE)
+    return Diag(OpLoc, diag::err_parens_pointer_member_function)
+           << Op->getSourceRange();
+
+  // Taking the address of a dtor is illegal per C++ [class.dtor]p2.
+  if (isa<CXXDestructorDecl>(MD))
+    return Diag(OpLoc, diag::err_typecheck_addrof_dtor)
+           << DRE->getSourceRange();
+
+  if (DRE->getQualifier())
+    return false;
+
+  if (MD->getParent()->getName().empty())
+    return Diag(OpLoc, diag::err_unqualified_pointer_member_function)
+           << DRE->getSourceRange();
+
+  SmallString<32> Str;
+  StringRef Qual = (MD->getParent()->getName() + "::").toStringRef(Str);
+  return Diag(OpLoc, diag::err_unqualified_pointer_member_function)
+         << DRE->getSourceRange()
+         << FixItHint::CreateInsertion(DRE->getSourceRange().getBegin(), Qual);
+}
+
 /// CheckAddressOfOperand - The operand of & must be either a function
 /// designator or an lvalue designating an object. If it is an lvalue, the
 /// object cannot be declared with storage class register or be a bit field.
@@ -15064,28 +15111,7 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
     DeclRefExpr *DRE = cast<DeclRefExpr>(op);
     CXXMethodDecl *MD = cast<CXXMethodDecl>(DRE->getDecl());
 
-    // The id-expression was parenthesized.
-    if (OrigOp.get() != DRE) {
-      Diag(OpLoc, diag::err_parens_pointer_member_function)
-        << OrigOp.get()->getSourceRange();
-
-    // The method was named without a qualifier.
-    } else if (!DRE->getQualifier()) {
-      if (MD->getParent()->getName().empty())
-        Diag(OpLoc, diag::err_unqualified_pointer_member_function)
-          << op->getSourceRange();
-      else {
-        SmallString<32> Str;
-        StringRef Qual = (MD->getParent()->getName() + "::").toStringRef(Str);
-        Diag(OpLoc, diag::err_unqualified_pointer_member_function)
-          << op->getSourceRange()
-          << FixItHint::CreateInsertion(op->getSourceRange().getBegin(), Qual);
-      }
-    }
-
-    // Taking the address of a dtor is illegal per C++ [class.dtor]p2.
-    if (isa<CXXDestructorDecl>(MD))
-      Diag(OpLoc, diag::err_typecheck_addrof_dtor) << op->getSourceRange();
+    CheckUseOfCXXMethodAsAddressOfOperand(OpLoc, OrigOp.get(), MD);
 
     QualType MPTy = Context.getMemberPointerType(
         op->getType(), Context.getTypeDeclType(MD->getParent()).getTypePtr());
@@ -15105,7 +15131,11 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
           << op->getType() << op->getSourceRange();
         return QualType();
       }
+    } else if (const auto *DRE = dyn_cast<DeclRefExpr>(op)) {
+      if (const auto *MD = dyn_cast_or_null<CXXMethodDecl>(DRE->getDecl()))
+        CheckUseOfCXXMethodAsAddressOfOperand(OpLoc, OrigOp.get(), MD);
     }
+
   } else if (op->getObjectKind() == OK_BitField) { // C99 6.5.3.2p1
     // The operand cannot be a bit-field
     AddressOfError = AO_Bit_Field;
@@ -16496,7 +16526,7 @@ bool Sema::isQualifiedMemberAccess(Expr *E) {
     if (isa<FieldDecl>(VD) || isa<IndirectFieldDecl>(VD))
       return true;
     if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(VD))
-      return Method->isInstance();
+      return Method->isImplicitObjectMemberFunction();
 
     return false;
   }
@@ -16507,7 +16537,7 @@ bool Sema::isQualifiedMemberAccess(Expr *E) {
 
     for (NamedDecl *D : ULE->decls()) {
       if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
-        if (Method->isInstance())
+        if (Method->isImplicitObjectMemberFunction())
           return true;
       } else {
         // Overload set does not contain methods.
@@ -18282,7 +18312,7 @@ void Sema::MarkExpressionAsImmediateEscalating(Expr *E) {
 
 ExprResult Sema::CheckForImmediateInvocation(ExprResult E, FunctionDecl *Decl) {
   if (isUnevaluatedContext() || !E.isUsable() || !Decl ||
-      !Decl->isImmediateFunction() || isConstantEvaluated() ||
+      !Decl->isImmediateFunction() || isAlwaysConstantEvaluatedContext() ||
       isCheckingDefaultArgumentOrInitializer() ||
       RebuildingImmediateInvocation || isImmediateFunctionContext())
     return E;
@@ -18438,7 +18468,10 @@ static void RemoveNestedImmediateInvocation(
       DRSet.erase(cast<DeclRefExpr>(E->getCallee()->IgnoreImplicit()));
       return Base::TransformCXXOperatorCallExpr(E);
     }
-    /// Base::TransformInitializer skip ConstantExpr so we need to visit them
+    /// Base::TransformUserDefinedLiteral doesn't preserve the
+    /// UserDefinedLiteral node.
+    ExprResult TransformUserDefinedLiteral(UserDefinedLiteral *E) { return E; }
+    /// Base::TransformInitializer skips ConstantExpr so we need to visit them
     /// here.
     ExprResult TransformInitializer(Expr *Init, bool NotCopyInit) {
       if (!Init)
@@ -19127,7 +19160,7 @@ MarkVarDeclODRUsed(ValueDecl *V, SourceLocation Loc, Sema &SemaRef,
       // Diagnose ODR-use of host global variables in device functions.
       // Reference of device global variables in host functions is allowed
       // through shadow variables therefore it is not diagnosed.
-      if (SemaRef.LangOpts.CUDAIsDevice) {
+      if (SemaRef.LangOpts.CUDAIsDevice && !SemaRef.LangOpts.HIPStdPar) {
         SemaRef.targetDiag(Loc, diag::err_ref_bad_target)
             << /*host*/ 2 << /*variable*/ 1 << Var << UserTarget;
         SemaRef.targetDiag(Var->getLocation(),
@@ -19227,7 +19260,8 @@ static bool isVariableAlreadyCapturedInScopeInfo(CapturingScopeInfo *CSI,
     // private instances of the captured declarations.
     const Capture &Cap = CSI->getCapture(Var);
     if (Cap.isCopyCapture() &&
-        !(isa<LambdaScopeInfo>(CSI) && cast<LambdaScopeInfo>(CSI)->Mutable) &&
+        !(isa<LambdaScopeInfo>(CSI) &&
+          !cast<LambdaScopeInfo>(CSI)->lambdaCaptureShouldBeConst()) &&
         !(isa<CapturedRegionScopeInfo>(CSI) &&
           cast<CapturedRegionScopeInfo>(CSI)->CapRegionKind == CR_OpenMP))
       DeclRefType.addConst();
@@ -19544,7 +19578,8 @@ static bool captureInLambda(LambdaScopeInfo *LSI, ValueDecl *Var,
     //   declared const (9.3.1) if and only if the lambda-expression's
     //   parameter-declaration-clause is not followed by mutable.
     DeclRefType = CaptureType.getNonReferenceType();
-    if (!LSI->Mutable && !CaptureType->isReferenceType())
+    bool Const = LSI->lambdaCaptureShouldBeConst();
+    if (Const && !CaptureType->isReferenceType())
       DeclRefType.addConst();
   }
 
@@ -20612,6 +20647,34 @@ void Sema::MarkVariableReferenced(SourceLocation Loc, VarDecl *Var) {
   DoMarkVarDeclReferenced(*this, Loc, Var, nullptr, RefsMinusAssignments);
 }
 
+// C++ [temp.dep.expr]p3:
+//   An id-expression is type-dependent if it contains:
+//     - an identifier associated by name lookup with an entity captured by copy
+//       in a lambda-expression that has an explicit object parameter whose type
+//       is dependent ([dcl.fct]),
+static void FixDependencyOfIdExpressionsInLambdaWithDependentObjectParameter(
+    Sema &SemaRef, ValueDecl *D, Expr *E) {
+  auto *ID = dyn_cast<DeclRefExpr>(E);
+  if (!ID || ID->isTypeDependent())
+    return;
+
+  auto IsDependent = [&]() {
+    const LambdaScopeInfo *LSI = SemaRef.getCurLambda();
+    if (!LSI)
+      return false;
+    if (!LSI->ExplicitObjectParameter ||
+        !LSI->ExplicitObjectParameter->getType()->isDependentType())
+      return false;
+    if (!LSI->CaptureMap.count(D))
+      return false;
+    const Capture &Cap = LSI->getCapture(D);
+    return !Cap.isCopyCapture();
+  }();
+
+  ID->setCapturedByCopyInLambdaWithExplicitObjectParameter(
+      IsDependent, SemaRef.getASTContext());
+}
+
 static void
 MarkExprReferenced(Sema &SemaRef, SourceLocation Loc, Decl *D, Expr *E,
                    bool MightBeOdrUse,
@@ -20621,14 +20684,19 @@ MarkExprReferenced(Sema &SemaRef, SourceLocation Loc, Decl *D, Expr *E,
 
   if (VarDecl *Var = dyn_cast<VarDecl>(D)) {
     DoMarkVarDeclReferenced(SemaRef, Loc, Var, E, RefsMinusAssignments);
+    if (SemaRef.getLangOpts().CPlusPlus)
+      FixDependencyOfIdExpressionsInLambdaWithDependentObjectParameter(SemaRef,
+                                                                       Var, E);
     return;
   }
 
   if (BindingDecl *Decl = dyn_cast<BindingDecl>(D)) {
     DoMarkBindingDeclReferenced(SemaRef, Loc, Decl, E);
+    if (SemaRef.getLangOpts().CPlusPlus)
+      FixDependencyOfIdExpressionsInLambdaWithDependentObjectParameter(SemaRef,
+                                                                       Decl, E);
     return;
   }
-
   SemaRef.MarkAnyDeclReferenced(Loc, D, MightBeOdrUse);
 
   // If this is a call to a method via a cast, also mark the method in the
@@ -20668,7 +20736,7 @@ void Sema::MarkDeclRefReferenced(DeclRefExpr *E, const Expr *Base) {
       OdrUse = false;
 
   if (auto *FD = dyn_cast<FunctionDecl>(E->getDecl())) {
-    if (!isUnevaluatedContext() && !isConstantEvaluated() &&
+    if (!isUnevaluatedContext() && !isConstantEvaluatedContext() &&
         !isImmediateFunctionContext() &&
         !isCheckingDefaultArgumentOrInitializer() &&
         FD->isImmediateFunction() && !RebuildingImmediateInvocation &&
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c147cc5293fd78..1153049496d129 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1160,7 +1160,7 @@ static QualType adjustCVQualifiersForCXXThisWithinLambda(
     auto C = CurLSI->getCXXThisCapture();
 
     if (C.isCopyCapture()) {
-      if (!CurLSI->Mutable)
+      if (CurLSI->lambdaCaptureShouldBeConst())
         ClassType.addConst();
       return ASTCtx.getPointerType(ClassType);
     }
@@ -1216,11 +1216,11 @@ QualType Sema::getCurrentThisType() {
   QualType ThisTy = CXXThisTypeOverride;
 
   if (CXXMethodDecl *method = dyn_cast<CXXMethodDecl>(DC)) {
-    if (method && method->isInstance())
+    if (method && method->isImplicitObjectMemberFunction())
       ThisTy = method->getThisType().getNonReferenceType();
   }
 
-  if (ThisTy.isNull() && isLambdaCallOperator(CurContext) &&
+  if (ThisTy.isNull() && isLambdaCallWithImplicitObjectParameter(CurContext) &&
       inTemplateInstantiation() && isa<CXXRecordDecl>(DC)) {
 
     // This is a lambda call operator that is being instantiated as a default
@@ -1398,10 +1398,22 @@ ExprResult Sema::ActOnCXXThis(SourceLocation Loc) {
   /// C++ 9.3.2: In the body of a non-static member function, the keyword this
   /// is a non-lvalue expression whose value is the address of the object for
   /// which the function is called.
-
   QualType ThisTy = getCurrentThisType();
-  if (ThisTy.isNull())
-    return Diag(Loc, diag::err_invalid_this_use);
+
+  if (ThisTy.isNull()) {
+    DeclContext *DC = getFunctionLevelDeclContext();
+
+    if (const auto *Method = dyn_cast<CXXMethodDecl>(DC);
+        Method && Method->isExplicitObjectMemberFunction()) {
+      return Diag(Loc, diag::err_invalid_this_use) << 1;
+    }
+
+    if (isLambdaCallWithExplicitObjectParameter(CurContext))
+      return Diag(Loc, diag::err_invalid_this_use) << 1;
+
+    return Diag(Loc, diag::err_invalid_this_use) << 0;
+  }
+
   return BuildCXXThisExpr(Loc, ThisTy, /*IsImplicit=*/false);
 }
 
@@ -3960,7 +3972,7 @@ void Sema::CheckVirtualDtorCall(CXXDestructorDecl *dtor, SourceLocation Loc,
   if (getSourceManager().isInSystemHeader(PointeeRD->getLocation()))
     return;
 
-  QualType ClassType = dtor->getThisObjectType();
+  QualType ClassType = dtor->getFunctionObjectParameterType();
   if (PointeeRD->isAbstract()) {
     // If the class is abstract, we warn by default, because we're
     // sure the code has undefined behavior.
@@ -4319,15 +4331,17 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType,
     if (DiagnoseUseOfDecl(Fn, From->getBeginLoc()))
       return ExprError();
 
-    From = FixOverloadedFunctionReference(From, Found, Fn);
+    ExprResult Res = FixOverloadedFunctionReference(From, Found, Fn);
+    if (Res.isInvalid())
+      return ExprError();
 
     // We might get back another placeholder expression if we resolved to a
     // builtin.
-    ExprResult Checked = CheckPlaceholderExpr(From);
-    if (Checked.isInvalid())
+    Res = CheckPlaceholderExpr(Res.get());
+    if (Res.isInvalid())
       return ExprError();
 
-    From = Checked.get();
+    From = Res.get();
     FromType = From->getType();
   }
 
@@ -8052,68 +8066,6 @@ ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base,
                                    Destructed);
 }
 
-ExprResult Sema::BuildCXXMemberCallExpr(Expr *E, NamedDecl *FoundDecl,
-                                        CXXConversionDecl *Method,
-                                        bool HadMultipleCandidates) {
-  // Convert the expression to match the conversion function's implicit object
-  // parameter.
-  ExprResult Exp = PerformObjectArgumentInitialization(E, /*Qualifier=*/nullptr,
-                                          FoundDecl, Method);
-  if (Exp.isInvalid())
-    return true;
-
-  if (Method->getParent()->isLambda() &&
-      Method->getConversionType()->isBlockPointerType()) {
-    // This is a lambda conversion to block pointer; check if the argument
-    // was a LambdaExpr.
-    Expr *SubE = E;
-    CastExpr *CE = dyn_cast<CastExpr>(SubE);
-    if (CE && CE->getCastKind() == CK_NoOp)
-      SubE = CE->getSubExpr();
-    SubE = SubE->IgnoreParens();
-    if (CXXBindTemporaryExpr *BE = dyn_cast<CXXBindTemporaryExpr>(SubE))
-      SubE = BE->getSubExpr();
-    if (isa<LambdaExpr>(SubE)) {
-      // For the conversion to block pointer on a lambda expression, we
-      // construct a special BlockLiteral instead; this doesn't really make
-      // a difference in ARC, but outside of ARC the resulting block literal
-      // follows the normal lifetime rules for block literals instead of being
-      // autoreleased.
-      PushExpressionEvaluationContext(
-          ExpressionEvaluationContext::PotentiallyEvaluated);
-      ExprResult BlockExp = BuildBlockForLambdaConversion(
-          Exp.get()->getExprLoc(), Exp.get()->getExprLoc(), Method, Exp.get());
-      PopExpressionEvaluationContext();
-
-      // FIXME: This note should be produced by a CodeSynthesisContext.
-      if (BlockExp.isInvalid())
-        Diag(Exp.get()->getExprLoc(), diag::note_lambda_to_block_conv);
-      return BlockExp;
-    }
-  }
-
-  MemberExpr *ME =
-      BuildMemberExpr(Exp.get(), /*IsArrow=*/false, SourceLocation(),
-                      NestedNameSpecifierLoc(), SourceLocation(), Method,
-                      DeclAccessPair::make(FoundDecl, FoundDecl->getAccess()),
-                      HadMultipleCandidates, DeclarationNameInfo(),
-                      Context.BoundMemberTy, VK_PRValue, OK_Ordinary);
-
-  QualType ResultType = Method->getReturnType();
-  ExprValueKind VK = Expr::getValueKindForType(ResultType);
-  ResultType = ResultType.getNonLValueExprType(Context);
-
-  CXXMemberCallExpr *CE = CXXMemberCallExpr::Create(
-      Context, ME, /*Args=*/{}, ResultType, VK, Exp.get()->getEndLoc(),
-      CurFPFeatureOverrides());
-
-  if (CheckFunctionCall(Method, CE,
-                        Method->getType()->castAs<FunctionProtoType>()))
-    return ExprError();
-
-  return CheckForImmediateInvocation(CE, CE->getMethodDecl());
-}
-
 ExprResult Sema::BuildCXXNoexceptExpr(SourceLocation KeyLoc, Expr *Operand,
                                       SourceLocation RParen) {
   // If the operand is an unresolved lookup expression, the expression is ill-
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index fe92215ae46776..bd85b548c0dd60 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -46,7 +46,7 @@ enum IMAKind {
 
   /// The reference may be to an instance member, but it might be invalid if
   /// so, because the context is not an instance method.
-  IMA_Mixed_StaticContext,
+  IMA_Mixed_StaticOrExplicitContext,
 
   /// The reference may be to an instance member, but it is invalid if
   /// so, because the context is from an unrelated class.
@@ -63,7 +63,7 @@ enum IMAKind {
 
   /// The reference may be to an unresolved using declaration and the
   /// context is not an instance method.
-  IMA_Unresolved_StaticContext,
+  IMA_Unresolved_StaticOrExplicitContext,
 
   // The reference refers to a field which is not a member of the containing
   // class, which is allowed because we're in C++11 mode and the context is
@@ -72,7 +72,7 @@ enum IMAKind {
 
   /// All possible referrents are instance members and the current
   /// context is not an instance method.
-  IMA_Error_StaticContext,
+  IMA_Error_StaticOrExplicitContext,
 
   /// All possible referrents are instance members of an unrelated
   /// class.
@@ -91,11 +91,14 @@ static IMAKind ClassifyImplicitMemberAccess(Sema &SemaRef,
 
   DeclContext *DC = SemaRef.getFunctionLevelDeclContext();
 
-  bool isStaticContext = SemaRef.CXXThisTypeOverride.isNull() &&
-    (!isa<CXXMethodDecl>(DC) || cast<CXXMethodDecl>(DC)->isStatic());
+  bool isStaticOrExplicitContext =
+      SemaRef.CXXThisTypeOverride.isNull() &&
+      (!isa<CXXMethodDecl>(DC) || cast<CXXMethodDecl>(DC)->isStatic() ||
+       cast<CXXMethodDecl>(DC)->isExplicitObjectMemberFunction());
 
   if (R.isUnresolvableResult())
-    return isStaticContext ? IMA_Unresolved_StaticContext : IMA_Unresolved;
+    return isStaticOrExplicitContext ? IMA_Unresolved_StaticOrExplicitContext
+                                     : IMA_Unresolved;
 
   // Collect all the declaring classes of instance members we find.
   bool hasNonInstance = false;
@@ -152,12 +155,12 @@ static IMAKind ClassifyImplicitMemberAccess(Sema &SemaRef,
 
   // If the current context is not an instance method, it can't be
   // an implicit member reference.
-  if (isStaticContext) {
+  if (isStaticOrExplicitContext) {
     if (hasNonInstance)
-      return IMA_Mixed_StaticContext;
+      return IMA_Mixed_StaticOrExplicitContext;
 
     return AbstractInstanceResult ? AbstractInstanceResult
-                                  : IMA_Error_StaticContext;
+                                  : IMA_Error_StaticOrExplicitContext;
   }
 
   CXXRecordDecl *contextClass;
@@ -167,7 +170,7 @@ static IMAKind ClassifyImplicitMemberAccess(Sema &SemaRef,
     contextClass = RD;
   else
     return AbstractInstanceResult ? AbstractInstanceResult
-                                  : IMA_Error_StaticContext;
+                                  : IMA_Error_StaticOrExplicitContext;
 
   // [class.mfct.non-static]p3:
   // ...is used in the body of a non-static member function of class X,
@@ -214,14 +217,31 @@ static void diagnoseInstanceReference(Sema &SemaRef,
   CXXRecordDecl *RepClass = dyn_cast<CXXRecordDecl>(Rep->getDeclContext());
 
   bool InStaticMethod = Method && Method->isStatic();
+  bool InExplicitObjectMethod =
+      Method && Method->isExplicitObjectMemberFunction();
   bool IsField = isa<FieldDecl>(Rep) || isa<IndirectFieldDecl>(Rep);
 
+  std::string Replacement;
+  if (InExplicitObjectMethod) {
+    DeclarationName N = Method->getParamDecl(0)->getDeclName();
+    if (!N.isEmpty()) {
+      Replacement.append(N.getAsString());
+      Replacement.append(".");
+    }
+  }
   if (IsField && InStaticMethod)
     // "invalid use of member 'x' in static member function"
-    SemaRef.Diag(Loc, diag::err_invalid_member_use_in_static_method)
-        << Range << nameInfo.getName();
-  else if (ContextClass && RepClass && SS.isEmpty() && !InStaticMethod &&
-           !RepClass->Equals(ContextClass) && RepClass->Encloses(ContextClass))
+    SemaRef.Diag(Loc, diag::err_invalid_member_use_in_method)
+        << Range << nameInfo.getName() << /*static*/ 0;
+  else if (IsField && InExplicitObjectMethod) {
+    auto Diag = SemaRef.Diag(Loc, diag::err_invalid_member_use_in_method)
+                << Range << nameInfo.getName() << /*explicit*/ 1;
+    if (!Replacement.empty())
+      Diag << FixItHint::CreateInsertion(Loc, Replacement);
+  } else if (ContextClass && RepClass && SS.isEmpty() &&
+             !InExplicitObjectMethod && !InStaticMethod &&
+             !RepClass->Equals(ContextClass) &&
+             RepClass->Encloses(ContextClass))
     // Unqualified lookup in a non-static member function found a member of an
     // enclosing class.
     SemaRef.Diag(Loc, diag::err_nested_non_static_member_use)
@@ -229,9 +249,16 @@ static void diagnoseInstanceReference(Sema &SemaRef,
   else if (IsField)
     SemaRef.Diag(Loc, diag::err_invalid_non_static_member_use)
       << nameInfo.getName() << Range;
-  else
+  else if (!InExplicitObjectMethod)
     SemaRef.Diag(Loc, diag::err_member_call_without_object)
-      << Range;
+        << Range << /*static*/ 0;
+  else {
+    const auto *Callee = dyn_cast<CXXMethodDecl>(Rep);
+    auto Diag = SemaRef.Diag(Loc, diag::err_member_call_without_object)
+                << Range << Callee->isExplicitObjectMemberFunction();
+    if (!Replacement.empty())
+      Diag << FixItHint::CreateInsertion(Loc, Replacement);
+  }
 }
 
 /// Builds an expression which might be an implicit member expression.
@@ -255,13 +282,13 @@ ExprResult Sema::BuildPossibleImplicitMemberExpr(
     [[fallthrough]];
   case IMA_Static:
   case IMA_Abstract:
-  case IMA_Mixed_StaticContext:
-  case IMA_Unresolved_StaticContext:
+  case IMA_Mixed_StaticOrExplicitContext:
+  case IMA_Unresolved_StaticOrExplicitContext:
     if (TemplateArgs || TemplateKWLoc.isValid())
       return BuildTemplateIdExpr(SS, TemplateKWLoc, R, false, TemplateArgs);
     return AsULE ? AsULE : BuildDeclarationNameExpr(SS, R, false);
 
-  case IMA_Error_StaticContext:
+  case IMA_Error_StaticOrExplicitContext:
   case IMA_Error_Unrelated:
     diagnoseInstanceReference(*this, SS, R.getRepresentativeDecl(),
                               R.getLookupNameInfo());
diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 5df830e5bee6d6..33e98bc2d6f713 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -1800,7 +1800,8 @@ bool Sema::CheckMessageArgumentTypes(
   // FIXME. This need be cleaned up.
   if (Args.size() < NumNamedArgs) {
     Diag(SelLoc, diag::err_typecheck_call_too_few_args)
-      << 2 << NumNamedArgs << static_cast<unsigned>(Args.size());
+        << 2 << NumNamedArgs << static_cast<unsigned>(Args.size())
+        << /*is non object*/ 0;
     return false;
   }
 
@@ -1898,7 +1899,7 @@ bool Sema::CheckMessageArgumentTypes(
       Diag(Args[NumNamedArgs]->getBeginLoc(),
            diag::err_typecheck_call_too_many_args)
           << 2 /*method*/ << NumNamedArgs << static_cast<unsigned>(Args.size())
-          << Method->getSourceRange()
+          << Method->getSourceRange() << /*is non object*/ 0
           << SourceRange(Args[NumNamedArgs]->getBeginLoc(),
                          Args.back()->getEndLoc());
     }
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index de576cc52c42e7..fd95b16b84b6e3 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -7421,8 +7421,9 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) {
       return true;
   if (!isInStlNamespace(Callee->getParent()))
     return false;
-  if (!isRecordWithAttr<PointerAttr>(Callee->getThisObjectType()) &&
-      !isRecordWithAttr<OwnerAttr>(Callee->getThisObjectType()))
+  if (!isRecordWithAttr<PointerAttr>(
+          Callee->getFunctionObjectParameterType()) &&
+      !isRecordWithAttr<OwnerAttr>(Callee->getFunctionObjectParameterType()))
     return false;
   if (Callee->getReturnType()->isPointerType() ||
       isRecordWithAttr<PointerAttr>(Callee->getReturnType())) {
@@ -7557,7 +7558,7 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) {
       QualType LHST;
       auto *MD = dyn_cast<CXXMethodDecl>(FD);
       if (MD && MD->isCXXInstanceMember())
-        LHST = Ctx.getLValueReferenceType(MD->getThisObjectType());
+        LHST = Ctx.getLValueReferenceType(MD->getFunctionObjectParameterType());
       else
         LHST = MD->getParamDecl(0)->getType();
       if (Ctx.hasSameType(RetT, LHST))
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 1702ddb3ee0fbf..421048aaff5c90 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -379,6 +379,38 @@ buildTypeForLambdaCallOperator(Sema &S, clang::CXXRecordDecl *Class,
   return MethodType;
 }
 
+// [C++2b] [expr.prim.lambda.closure] p4
+//  Given a lambda with a lambda-capture, the type of the explicit object
+//  parameter, if any, of the lambda's function call operator (possibly
+//  instantiated from a function call operator template) shall be either:
+//  - the closure type,
+//  - class type derived from the closure type, or
+//  - a reference to a possibly cv-qualified such type.
+void Sema::DiagnoseInvalidExplicitObjectParameterInLambda(
+    CXXMethodDecl *Method) {
+  if (!isLambdaCallWithExplicitObjectParameter(Method))
+    return;
+  CXXRecordDecl *RD = Method->getParent();
+  if (Method->getType()->isDependentType())
+    return;
+  if (RD->getLambdaCaptureDefault() == LambdaCaptureDefault::LCD_None &&
+      RD->capture_size() == 0)
+    return;
+  QualType ExplicitObjectParameterType = Method->getParamDecl(0)
+                                             ->getType()
+                                             .getNonReferenceType()
+                                             .getUnqualifiedType()
+                                             .getDesugaredType(getASTContext());
+  QualType LambdaType = getASTContext().getRecordType(RD);
+  if (LambdaType == ExplicitObjectParameterType)
+    return;
+  if (IsDerivedFrom(RD->getLocation(), ExplicitObjectParameterType, LambdaType))
+    return;
+  Diag(Method->getParamDecl(0)->getLocation(),
+       diag::err_invalid_explicit_object_type_in_lambda)
+      << ExplicitObjectParameterType;
+}
+
 void Sema::handleLambdaNumbering(
     CXXRecordDecl *Class, CXXMethodDecl *Method,
     std::optional<CXXRecordDecl::LambdaNumbering> NumberingOverride) {
@@ -860,9 +892,17 @@ static TypeSourceInfo *getLambdaType(Sema &S, LambdaIntroducer &Intro,
   if (ParamInfo.getNumTypeObjects() == 0) {
     MethodTyInfo = getDummyLambdaType(S, Loc);
   } else {
+    // Check explicit parameters
+    S.CheckExplicitObjectLambda(ParamInfo);
+
     DeclaratorChunk::FunctionTypeInfo &FTI = ParamInfo.getFunctionTypeInfo();
+
+    bool HasExplicitObjectParameter =
+        ParamInfo.isExplicitObjectMemberFunction();
+
     ExplicitResultType = FTI.hasTrailingReturnType();
-    if (!FTI.hasMutableQualifier() && !IsLambdaStatic)
+    if (!FTI.hasMutableQualifier() && !IsLambdaStatic &&
+        !HasExplicitObjectParameter)
       FTI.getOrCreateMethodQualifiers().SetTypeQual(DeclSpec::TQ_const, Loc);
 
     if (ExplicitResultType && S.getLangOpts().HLSL) {
@@ -1686,7 +1726,7 @@ static void addFunctionPointerConversion(Sema &S, SourceRange IntroducerRange,
   // function that will be the result of the conversion with a
   // certain unique ID.
   // When it is static we just return the static call operator instead.
-  if (CallOperator->isInstance()) {
+  if (CallOperator->isImplicitObjectMemberFunction()) {
     DeclarationName InvokerName =
         &S.Context.Idents.get(getLambdaStaticInvokerName());
     // FIXME: Instead of passing in the CallOperator->getTypeSourceInfo()
@@ -2256,33 +2296,55 @@ ExprResult Sema::BuildBlockForLambdaConversion(SourceLocation CurrentLocation,
   return BuildBlock;
 }
 
+static FunctionDecl *getPatternFunctionDecl(FunctionDecl *FD) {
+  if (FD->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization) {
+    while (FD->getInstantiatedFromMemberFunction())
+      FD = FD->getInstantiatedFromMemberFunction();
+    return FD;
+  }
+
+  if (FD->getTemplatedKind() == FunctionDecl::TK_DependentNonTemplate)
+    return FD->getInstantiatedFromDecl();
+
+  FunctionTemplateDecl *FTD = FD->getPrimaryTemplate();
+  if (!FTD)
+    return nullptr;
+
+  while (FTD->getInstantiatedFromMemberTemplate())
+    FTD = FTD->getInstantiatedFromMemberTemplate();
+
+  return FTD->getTemplatedDecl();
+}
+
 Sema::LambdaScopeForCallOperatorInstantiationRAII::
     LambdaScopeForCallOperatorInstantiationRAII(
-        Sema &SemasRef, FunctionDecl *FD, MultiLevelTemplateArgumentList MLTAL,
-        LocalInstantiationScope &Scope)
-    : FunctionScopeRAII(SemasRef) {
+        Sema &SemaRef, FunctionDecl *FD, MultiLevelTemplateArgumentList MLTAL,
+        LocalInstantiationScope &Scope, bool ShouldAddDeclsFromParentScope)
+    : FunctionScopeRAII(SemaRef) {
   if (!isLambdaCallOperator(FD)) {
     FunctionScopeRAII::disable();
     return;
   }
 
-  if (FD->isTemplateInstantiation() && FD->getPrimaryTemplate()) {
-    FunctionTemplateDecl *PrimaryTemplate = FD->getPrimaryTemplate();
-    if (const auto *FromMemTempl =
-            PrimaryTemplate->getInstantiatedFromMemberTemplate()) {
-      SemasRef.addInstantiatedCapturesToScope(
-          FD, FromMemTempl->getTemplatedDecl(), Scope, MLTAL);
-    }
-  }
+  SemaRef.RebuildLambdaScopeInfo(cast<CXXMethodDecl>(FD));
 
-  else if (FD->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization ||
-           FD->getTemplatedKind() == FunctionDecl::TK_DependentNonTemplate) {
-    FunctionDecl *InstantiatedFrom =
-        FD->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization
-            ? FD->getInstantiatedFromMemberFunction()
-            : FD->getInstantiatedFromDecl();
-    SemasRef.addInstantiatedCapturesToScope(FD, InstantiatedFrom, Scope, MLTAL);
-  }
+  FunctionDecl *Pattern = getPatternFunctionDecl(FD);
+  if (Pattern) {
+    SemaRef.addInstantiatedCapturesToScope(FD, Pattern, Scope, MLTAL);
 
-  SemasRef.RebuildLambdaScopeInfo(cast<CXXMethodDecl>(FD));
+    FunctionDecl *ParentFD = FD;
+    while (ShouldAddDeclsFromParentScope) {
+
+      ParentFD =
+          dyn_cast<FunctionDecl>(getLambdaAwareParentOfDeclContext(ParentFD));
+      Pattern =
+          dyn_cast<FunctionDecl>(getLambdaAwareParentOfDeclContext(Pattern));
+
+      if (!FD || !Pattern)
+        break;
+
+      SemaRef.addInstantiatedParametersToScope(ParentFD, Pattern, Scope, MLTAL);
+      SemaRef.addInstantiatedLocalVarsToScope(ParentFD, Pattern, Scope);
+    }
+  }
 }
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index d42a21976821a4..eb755e2a0c4540 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -17553,6 +17553,9 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
   case OMPC_partial:
     Res = ActOnOpenMPPartialClause(nullptr, StartLoc, /*LParenLoc=*/{}, EndLoc);
     break;
+  case OMPC_ompx_bare:
+    Res = ActOnOpenMPXBareClause(StartLoc, EndLoc);
+    break;
   case OMPC_if:
   case OMPC_final:
   case OMPC_num_threads:
@@ -24279,3 +24282,8 @@ OMPClause *Sema::ActOnOpenMPXAttributeClause(ArrayRef<const Attr *> Attrs,
                                              SourceLocation EndLoc) {
   return new (Context) OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc);
 }
+
+OMPClause *Sema::ActOnOpenMPXBareClause(SourceLocation StartLoc,
+                                        SourceLocation EndLoc) {
+  return new (Context) OMPXBareClause(StartLoc, EndLoc);
+}
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 0ac2ac258c0c7d..ce78994e655381 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -910,6 +910,9 @@ static bool FunctionsCorrespond(ASTContext &Ctx, const FunctionDecl *X,
     return false;
   if (X->getNumParams() != Y->getNumParams())
     return false;
+  // FIXME: when do rewritten comparison operators
+  // with explicit object parameters correspond?
+  // https://cplusplus.github.io/CWG/issues/2797.html
   for (unsigned I = 0; I < X->getNumParams(); ++I)
     if (!Ctx.hasSameUnqualifiedType(X->getParamDecl(I)->getType(),
                                     Y->getParamDecl(I)->getType()))
@@ -994,7 +997,7 @@ bool OverloadCandidateSet::OperatorRewriteInfo::shouldAddReversed(
   }
   // Don't bother adding a reversed candidate that can never be a better
   // match than the non-reversed version.
-  return FD->getNumParams() != 2 ||
+  return FD->getNumNonObjectParams() != 2 ||
          !S.Context.hasSameUnqualifiedType(FD->getParamDecl(0)->getType(),
                                            FD->getParamDecl(1)->getType()) ||
          FD->hasAttr<EnableIfAttr>();
@@ -1235,9 +1238,11 @@ Sema::CheckOverload(Scope *S, FunctionDecl *New, const LookupResult &Old,
   return Ovl_Overload;
 }
 
-bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
-                      bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs,
-                      bool ConsiderRequiresClauses) {
+static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
+                                     FunctionDecl *Old,
+                                     bool UseMemberUsingDeclRules,
+                                     bool ConsiderCudaAttrs,
+                                     bool UseOverrideRules = false) {
   // C++ [basic.start.main]p2: This function shall not be overloaded.
   if (New->isMain())
     return false;
@@ -1256,8 +1261,8 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
     return true;
 
   // Is the function New an overload of the function Old?
-  QualType OldQType = Context.getCanonicalType(Old->getType());
-  QualType NewQType = Context.getCanonicalType(New->getType());
+  QualType OldQType = SemaRef.Context.getCanonicalType(Old->getType());
+  QualType NewQType = SemaRef.Context.getCanonicalType(New->getType());
 
   // Compare the signatures (C++ 1.3.10) of the two functions to
   // determine whether they are overloads. If we find any mismatch
@@ -1275,10 +1280,7 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
   // The signature of a function includes the types of its
   // parameters (C++ 1.3.10), which includes the presence or absence
   // of the ellipsis; see C++ DR 357).
-  if (OldQType != NewQType &&
-      (OldType->getNumParams() != NewType->getNumParams() ||
-       OldType->isVariadic() != NewType->isVariadic() ||
-       !FunctionParamTypesAreEqual(OldType, NewType)))
+  if (OldQType != NewQType && OldType->isVariadic() != NewType->isVariadic())
     return true;
 
   // For member-like friends, the enclosing class is part of the signature.
@@ -1286,6 +1288,128 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
        Old->isMemberLikeConstrainedFriend()) &&
       !New->getLexicalDeclContext()->Equals(Old->getLexicalDeclContext()))
     return true;
+  const auto *OldMethod = dyn_cast<CXXMethodDecl>(Old);
+  const auto *NewMethod = dyn_cast<CXXMethodDecl>(New);
+
+  int OldParamsOffset = 0;
+  int NewParamsOffset = 0;
+
+  // When determining if a method is an overload from a base class, act as if
+  // the implicit object parameter are of the same type.
+
+  auto NormalizeQualifiers = [&](const CXXMethodDecl *M, Qualifiers Q) {
+    if (M->isExplicitObjectMemberFunction())
+      return Q;
+
+    // We do not allow overloading based off of '__restrict'.
+    Q.removeRestrict();
+
+    // We may not have applied the implicit const for a constexpr member
+    // function yet (because we haven't yet resolved whether this is a static
+    // or non-static member function). Add it now, on the assumption that this
+    // is a redeclaration of OldMethod.
+    if (!SemaRef.getLangOpts().CPlusPlus14 &&
+        (M->isConstexpr() || M->isConsteval()) &&
+        !isa<CXXConstructorDecl>(NewMethod))
+      Q.addConst();
+    return Q;
+  };
+
+  auto CompareType = [&](QualType Base, QualType D) {
+    auto BS = Base.getNonReferenceType().getCanonicalType().split();
+    BS.Quals = NormalizeQualifiers(OldMethod, BS.Quals);
+
+    auto DS = D.getNonReferenceType().getCanonicalType().split();
+    DS.Quals = NormalizeQualifiers(NewMethod, DS.Quals);
+
+    if (BS.Quals != DS.Quals)
+      return false;
+
+    if (OldMethod->isImplicitObjectMemberFunction() &&
+        OldMethod->getParent() != NewMethod->getParent()) {
+      QualType ParentType =
+          SemaRef.Context.getTypeDeclType(OldMethod->getParent())
+              .getCanonicalType();
+      if (ParentType.getTypePtr() != BS.Ty)
+        return false;
+      BS.Ty = DS.Ty;
+    }
+
+    // FIXME: should we ignore some type attributes here?
+    if (BS.Ty != DS.Ty)
+      return false;
+
+    if (Base->isLValueReferenceType())
+      return D->isLValueReferenceType();
+    return Base->isRValueReferenceType() == D->isRValueReferenceType();
+  };
+
+  // If the function is a class member, its signature includes the
+  // cv-qualifiers (if any) and ref-qualifier (if any) on the function itself.
+  auto DiagnoseInconsistentRefQualifiers = [&]() {
+    if (SemaRef.LangOpts.CPlusPlus23)
+      return false;
+    if (OldMethod->getRefQualifier() == NewMethod->getRefQualifier())
+      return false;
+    if (OldMethod->isExplicitObjectMemberFunction() ||
+        NewMethod->isExplicitObjectMemberFunction())
+      return false;
+    if (!UseMemberUsingDeclRules && (OldMethod->getRefQualifier() == RQ_None ||
+                                     NewMethod->getRefQualifier() == RQ_None)) {
+      SemaRef.Diag(NewMethod->getLocation(), diag::err_ref_qualifier_overload)
+          << NewMethod->getRefQualifier() << OldMethod->getRefQualifier();
+      SemaRef.Diag(OldMethod->getLocation(), diag::note_previous_declaration);
+      return true;
+    }
+    return false;
+  };
+
+  if (OldMethod && OldMethod->isExplicitObjectMemberFunction())
+    OldParamsOffset++;
+  if (NewMethod && NewMethod->isExplicitObjectMemberFunction())
+    NewParamsOffset++;
+
+  if (OldType->getNumParams() - OldParamsOffset !=
+          NewType->getNumParams() - NewParamsOffset ||
+      !SemaRef.FunctionParamTypesAreEqual(
+          {OldType->param_type_begin() + OldParamsOffset,
+           OldType->param_type_end()},
+          {NewType->param_type_begin() + NewParamsOffset,
+           NewType->param_type_end()},
+          nullptr)) {
+    return true;
+  }
+
+  if (OldMethod && NewMethod && !OldMethod->isStatic() &&
+      !OldMethod->isStatic()) {
+    bool HaveCorrespondingObjectParameters = [&](const CXXMethodDecl *Old,
+                                                 const CXXMethodDecl *New) {
+      auto NewObjectType = New->getFunctionObjectParameterReferenceType();
+      auto OldObjectType = Old->getFunctionObjectParameterReferenceType();
+
+      auto IsImplicitWithNoRefQual = [](const CXXMethodDecl *F) {
+        return F->getRefQualifier() == RQ_None &&
+               !F->isExplicitObjectMemberFunction();
+      };
+
+      if (IsImplicitWithNoRefQual(Old) != IsImplicitWithNoRefQual(New) &&
+          CompareType(OldObjectType.getNonReferenceType(),
+                      NewObjectType.getNonReferenceType()))
+        return true;
+      return CompareType(OldObjectType, NewObjectType);
+    }(OldMethod, NewMethod);
+
+    if (!HaveCorrespondingObjectParameters) {
+      if (DiagnoseInconsistentRefQualifiers())
+        return true;
+      // CWG2554
+      // and, if at least one is an explicit object member function, ignoring
+      // object parameters
+      if (!UseOverrideRules || (!NewMethod->isExplicitObjectMemberFunction() &&
+                                !OldMethod->isExplicitObjectMemberFunction()))
+        return true;
+    }
+  }
 
   if (NewTemplate) {
     // C++ [temp.over.link]p4:
@@ -1297,11 +1421,11 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
     //
     // We check the return type and template parameter lists for function
     // templates first; the remaining checks follow.
-    bool SameTemplateParameterList = TemplateParameterListsAreEqual(
+    bool SameTemplateParameterList = SemaRef.TemplateParameterListsAreEqual(
         NewTemplate, NewTemplate->getTemplateParameters(), OldTemplate,
-        OldTemplate->getTemplateParameters(), false, TPL_TemplateMatch);
-    bool SameReturnType = Context.hasSameType(Old->getDeclaredReturnType(),
-                                              New->getDeclaredReturnType());
+        OldTemplate->getTemplateParameters(), false, Sema::TPL_TemplateMatch);
+    bool SameReturnType = SemaRef.Context.hasSameType(
+        Old->getDeclaredReturnType(), New->getDeclaredReturnType());
     // FIXME(GH58571): Match template parameter list even for non-constrained
     // template heads. This currently ensures that the code prior to C++20 is
     // not newly broken.
@@ -1324,59 +1448,19 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
       return true;
   }
 
-  if (ConsiderRequiresClauses) {
+  if (!UseOverrideRules) {
     Expr *NewRC = New->getTrailingRequiresClause(),
          *OldRC = Old->getTrailingRequiresClause();
     if ((NewRC != nullptr) != (OldRC != nullptr))
       return true;
 
-    if (NewRC && !AreConstraintExpressionsEqual(Old, OldRC, New, NewRC))
-        return true;
-  }
-
-  // If the function is a class member, its signature includes the
-  // cv-qualifiers (if any) and ref-qualifier (if any) on the function itself.
-  //
-  // As part of this, also check whether one of the member functions
-  // is static, in which case they are not overloads (C++
-  // 13.1p2). While not part of the definition of the signature,
-  // this check is important to determine whether these functions
-  // can be overloaded.
-  CXXMethodDecl *OldMethod = dyn_cast<CXXMethodDecl>(Old);
-  CXXMethodDecl *NewMethod = dyn_cast<CXXMethodDecl>(New);
-  if (OldMethod && NewMethod &&
-      !OldMethod->isStatic() && !NewMethod->isStatic()) {
-    if (OldMethod->getRefQualifier() != NewMethod->getRefQualifier()) {
-      if (!UseMemberUsingDeclRules &&
-          (OldMethod->getRefQualifier() == RQ_None ||
-           NewMethod->getRefQualifier() == RQ_None)) {
-        // C++20 [over.load]p2:
-        //   - Member function declarations with the same name, the same
-        //     parameter-type-list, and the same trailing requires-clause (if
-        //     any), as well as member function template declarations with the
-        //     same name, the same parameter-type-list, the same trailing
-        //     requires-clause (if any), and the same template-head, cannot be
-        //     overloaded if any of them, but not all, have a ref-qualifier.
-        Diag(NewMethod->getLocation(), diag::err_ref_qualifier_overload)
-            << NewMethod->getRefQualifier() << OldMethod->getRefQualifier();
-        Diag(OldMethod->getLocation(), diag::note_previous_declaration);
-      }
+    if (NewRC && !SemaRef.AreConstraintExpressionsEqual(Old, OldRC, New, NewRC))
       return true;
-    }
+  }
 
-    // We may not have applied the implicit const for a constexpr member
-    // function yet (because we haven't yet resolved whether this is a static
-    // or non-static member function). Add it now, on the assumption that this
-    // is a redeclaration of OldMethod.
-    auto OldQuals = OldMethod->getMethodQualifiers();
-    auto NewQuals = NewMethod->getMethodQualifiers();
-    if (!getLangOpts().CPlusPlus14 && NewMethod->isConstexpr() &&
-        !isa<CXXConstructorDecl>(NewMethod))
-      NewQuals.addConst();
-    // We do not allow overloading based off of '__restrict'.
-    OldQuals.removeRestrict();
-    NewQuals.removeRestrict();
-    if (OldQuals != NewQuals)
+  if (NewMethod && OldMethod && OldMethod->isImplicitObjectMemberFunction() &&
+      NewMethod->isImplicitObjectMemberFunction()) {
+    if (DiagnoseInconsistentRefQualifiers())
       return true;
   }
 
@@ -1398,20 +1482,20 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
     if (NewI == NewE || OldI == OldE)
       return true;
     llvm::FoldingSetNodeID NewID, OldID;
-    NewI->getCond()->Profile(NewID, Context, true);
-    OldI->getCond()->Profile(OldID, Context, true);
+    NewI->getCond()->Profile(NewID, SemaRef.Context, true);
+    OldI->getCond()->Profile(OldID, SemaRef.Context, true);
     if (NewID != OldID)
       return true;
   }
 
-  if (getLangOpts().CUDA && ConsiderCudaAttrs) {
+  if (SemaRef.getLangOpts().CUDA && ConsiderCudaAttrs) {
     // Don't allow overloading of destructors.  (In theory we could, but it
     // would be a giant change to clang.)
     if (!isa<CXXDestructorDecl>(New)) {
-      CUDAFunctionTarget NewTarget = IdentifyCUDATarget(New),
-                         OldTarget = IdentifyCUDATarget(Old);
-      if (NewTarget != CFT_InvalidTarget) {
-        assert((OldTarget != CFT_InvalidTarget) &&
+      Sema::CUDAFunctionTarget NewTarget = SemaRef.IdentifyCUDATarget(New),
+                               OldTarget = SemaRef.IdentifyCUDATarget(Old);
+      if (NewTarget != Sema::CFT_InvalidTarget) {
+        assert((OldTarget != Sema::CFT_InvalidTarget) &&
                "Unexpected invalid target.");
 
         // Allow overloading of functions with same signature and different CUDA
@@ -1426,6 +1510,20 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
   return false;
 }
 
+bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
+                      bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs) {
+  return IsOverloadOrOverrideImpl(*this, New, Old, UseMemberUsingDeclRules,
+                                  ConsiderCudaAttrs);
+}
+
+bool Sema::IsOverride(FunctionDecl *MD, FunctionDecl *BaseMD,
+                      bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs) {
+  return IsOverloadOrOverrideImpl(*this, MD, BaseMD,
+                                  /*UseMemberUsingDeclRules=*/false,
+                                  /*ConsiderCudaAttrs=*/true,
+                                  /*UseOverrideRules=*/true);
+}
+
 /// Tries a user-defined conversion from From to ToType.
 ///
 /// Produces an implicit conversion sequence for when a standard conversion
@@ -1884,7 +1982,8 @@ static bool IsStandardConversion(Sema &S, Expr* From, QualType ToType,
       // fact that non-static member functions *must* have such an address-of
       // expression.
       CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Fn);
-      if (Method && !Method->isStatic()) {
+      if (Method && !Method->isStatic() &&
+          !Method->isExplicitObjectMemberFunction()) {
         assert(isa<UnaryOperator>(From->IgnoreParens()) &&
                "Non-unary operator on non-static member address");
         assert(cast<UnaryOperator>(From->IgnoreParens())->getOpcode()
@@ -3104,30 +3203,40 @@ void Sema::HandleFunctionTypeMismatch(PartialDiagnostic &PDiag,
 /// If `Reversed` is true, the parameters of `NewType` will be compared in
 /// reverse order. That's useful if one of the functions is being used as a C++20
 /// synthesized operator overload with a reversed parameter order.
-bool Sema::FunctionParamTypesAreEqual(const FunctionProtoType *OldType,
-                                      const FunctionProtoType *NewType,
-                                      unsigned *ArgPos, bool Reversed) {
-  assert(OldType->getNumParams() == NewType->getNumParams() &&
+bool Sema::FunctionParamTypesAreEqual(ArrayRef<QualType> Old,
+                                      ArrayRef<QualType> New, unsigned *ArgPos,
+                                      bool Reversed) {
+  assert(llvm::size(Old) == llvm::size(New) &&
          "Can't compare parameters of functions with different number of "
          "parameters!");
-  for (size_t I = 0; I < OldType->getNumParams(); I++) {
+
+  for (auto &&[Idx, Type] : llvm::enumerate(Old)) {
     // Reverse iterate over the parameters of `OldType` if `Reversed` is true.
-    size_t J = Reversed ? (OldType->getNumParams() - I - 1) : I;
+    size_t J = Reversed ? (llvm::size(New) - Idx - 1) : Idx;
 
     // Ignore address spaces in pointee type. This is to disallow overloading
     // on __ptr32/__ptr64 address spaces.
-    QualType Old = Context.removePtrSizeAddrSpace(OldType->getParamType(I).getUnqualifiedType());
-    QualType New = Context.removePtrSizeAddrSpace(NewType->getParamType(J).getUnqualifiedType());
+    QualType OldType =
+        Context.removePtrSizeAddrSpace(Type.getUnqualifiedType());
+    QualType NewType =
+        Context.removePtrSizeAddrSpace((New.begin() + J)->getUnqualifiedType());
 
-    if (!Context.hasSameType(Old, New)) {
+    if (!Context.hasSameType(OldType, NewType)) {
       if (ArgPos)
-        *ArgPos = I;
+        *ArgPos = Idx;
       return false;
     }
   }
   return true;
 }
 
+bool Sema::FunctionParamTypesAreEqual(const FunctionProtoType *OldType,
+                                      const FunctionProtoType *NewType,
+                                      unsigned *ArgPos, bool Reversed) {
+  return FunctionParamTypesAreEqual(OldType->param_types(),
+                                    NewType->param_types(), ArgPos, Reversed);
+}
+
 /// CheckPointerConversion - Check the pointer conversion from the
 /// expression From to the type ToType. This routine checks for
 /// ambiguous or inaccessible derived-to-base pointer
@@ -3532,7 +3641,7 @@ IsInitializerListConstructorConversion(Sema &S, Expr *From, QualType ToType,
   case OR_Success: {
     // Record the standard conversion we used and the conversion function.
     CXXConstructorDecl *Constructor = cast<CXXConstructorDecl>(Best->Function);
-    QualType ThisType = Constructor->getThisObjectType();
+    QualType ThisType = Constructor->getFunctionObjectParameterType();
     // Initializer lists don't have conversions as such.
     User.Before.setAsIdentityConversion();
     User.HadMultipleCandidates = HadMultipleCandidates;
@@ -3734,7 +3843,7 @@ IsUserDefinedConversion(Sema &S, Expr *From, QualType ToType,
       User.ConversionFunction = Constructor;
       User.FoundConversionFunction = Best->FoundDecl;
       User.After.setAsIdentityConversion();
-      User.After.setFromType(Constructor->getThisObjectType());
+      User.After.setFromType(Constructor->getFunctionObjectParameterType());
       User.After.setAllToTypes(ToType);
       return Result;
     }
@@ -5505,11 +5614,45 @@ static bool TryCopyInitialization(const CanQualType FromQTy,
 /// TryObjectArgumentInitialization - Try to initialize the object
 /// parameter of the given member function (@c Method) from the
 /// expression @p From.
-static ImplicitConversionSequence
-TryObjectArgumentInitialization(Sema &S, SourceLocation Loc, QualType FromType,
-                                Expr::Classification FromClassification,
-                                CXXMethodDecl *Method,
-                                CXXRecordDecl *ActingContext) {
+static ImplicitConversionSequence TryObjectArgumentInitialization(
+    Sema &S, SourceLocation Loc, QualType FromType,
+    Expr::Classification FromClassification, CXXMethodDecl *Method,
+    const CXXRecordDecl *ActingContext, bool InOverloadResolution = false,
+    QualType ExplicitParameterType = QualType(),
+    bool SuppressUserConversion = false) {
+
+  // We need to have an object of class type.
+  if (const auto *PT = FromType->getAs<PointerType>()) {
+    FromType = PT->getPointeeType();
+
+    // When we had a pointer, it's implicitly dereferenced, so we
+    // better have an lvalue.
+    assert(FromClassification.isLValue());
+  }
+
+  auto ValueKindFromClassification = [](Expr::Classification C) {
+    if (C.isPRValue())
+      return clang::VK_PRValue;
+    if (C.isXValue())
+      return VK_XValue;
+    return clang::VK_LValue;
+  };
+
+  if (Method->isExplicitObjectMemberFunction()) {
+    if (ExplicitParameterType.isNull())
+      ExplicitParameterType = Method->getFunctionObjectParameterReferenceType();
+    OpaqueValueExpr TmpExpr(Loc, FromType.getNonReferenceType(),
+                            ValueKindFromClassification(FromClassification));
+    ImplicitConversionSequence ICS = TryCopyInitialization(
+        S, &TmpExpr, ExplicitParameterType, SuppressUserConversion,
+        /*InOverloadResolution=*/true, false);
+    if (ICS.isBad())
+      ICS.Bad.FromExpr = nullptr;
+    return ICS;
+  }
+
+  assert(FromType->isRecordType());
+
   QualType ClassType = S.Context.getTypeDeclType(ActingContext);
   // [class.dtor]p2: A destructor can be invoked for a const, volatile or
   //                 const volatile object.
@@ -5525,17 +5668,6 @@ TryObjectArgumentInitialization(Sema &S, SourceLocation Loc, QualType FromType,
   // to exit early.
   ImplicitConversionSequence ICS;
 
-  // We need to have an object of class type.
-  if (const PointerType *PT = FromType->getAs<PointerType>()) {
-    FromType = PT->getPointeeType();
-
-    // When we had a pointer, it's implicitly dereferenced, so we
-    // better have an lvalue.
-    assert(FromClassification.isLValue());
-  }
-
-  assert(FromType->isRecordType());
-
   // C++0x [over.match.funcs]p4:
   //   For non-static member functions, the type of the implicit object
   //   parameter is
@@ -5582,9 +5714,9 @@ TryObjectArgumentInitialization(Sema &S, SourceLocation Loc, QualType FromType,
   ImplicitConversionKind SecondKind;
   if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) {
     SecondKind = ICK_Identity;
-  } else if (S.IsDerivedFrom(Loc, FromType, ClassType))
+  } else if (S.IsDerivedFrom(Loc, FromType, ClassType)) {
     SecondKind = ICK_Derived_To_Base;
-  else {
+  } else if (!Method->isExplicitObjectMemberFunction()) {
     ICS.setBad(BadConversionSequence::unrelated_class,
                FromType, ImplicitParamType);
     return ICS;
@@ -5634,13 +5766,11 @@ TryObjectArgumentInitialization(Sema &S, SourceLocation Loc, QualType FromType,
 /// PerformObjectArgumentInitialization - Perform initialization of
 /// the implicit object parameter for the given Method with the given
 /// expression.
-ExprResult
-Sema::PerformObjectArgumentInitialization(Expr *From,
-                                          NestedNameSpecifier *Qualifier,
-                                          NamedDecl *FoundDecl,
-                                          CXXMethodDecl *Method) {
+ExprResult Sema::PerformImplicitObjectArgumentInitialization(
+    Expr *From, NestedNameSpecifier *Qualifier, NamedDecl *FoundDecl,
+    CXXMethodDecl *Method) {
   QualType FromRecordType, DestType;
-  QualType ImplicitParamRecordType = Method->getThisObjectType();
+  QualType ImplicitParamRecordType = Method->getFunctionObjectParameterType();
 
   Expr::Classification FromClassification;
   if (const PointerType *PT = From->getType()->getAs<PointerType>()) {
@@ -6134,6 +6264,64 @@ ExprResult Sema::PerformContextuallyConvertToObjCPointer(Expr *From) {
   return ExprResult();
 }
 
+static QualType GetExplicitObjectType(Sema &S, const Expr *MemExprE) {
+  const Expr *Base = nullptr;
+  assert((isa<UnresolvedMemberExpr, MemberExpr>(MemExprE)) &&
+         "expected a member expression");
+
+  if (const auto M = dyn_cast<UnresolvedMemberExpr>(MemExprE);
+      M && !M->isImplicitAccess())
+    Base = M->getBase();
+  else if (const auto M = dyn_cast<MemberExpr>(MemExprE);
+           M && !M->isImplicitAccess())
+    Base = M->getBase();
+
+  QualType T = Base ? Base->getType() : S.getCurrentThisType();
+
+  if (T->isPointerType())
+    T = T->getPointeeType();
+
+  return T;
+}
+
+static Expr *GetExplicitObjectExpr(Sema &S, Expr *Obj,
+                                   const FunctionDecl *Fun) {
+  QualType ObjType = Obj->getType();
+  if (ObjType->isPointerType()) {
+    ObjType = ObjType->getPointeeType();
+    Obj = UnaryOperator::Create(S.getASTContext(), Obj, UO_Deref, ObjType,
+                                VK_LValue, OK_Ordinary, SourceLocation(),
+                                /*CanOverflow=*/false, FPOptionsOverride());
+  }
+  if (Obj->Classify(S.getASTContext()).isPRValue()) {
+    Obj = S.CreateMaterializeTemporaryExpr(
+        ObjType, Obj,
+        !Fun->getParamDecl(0)->getType()->isRValueReferenceType());
+  }
+  return Obj;
+}
+
+ExprResult Sema::InitializeExplicitObjectArgument(Sema &S, Expr *Obj,
+                                                  FunctionDecl *Fun) {
+  Obj = GetExplicitObjectExpr(S, Obj, Fun);
+  return S.PerformCopyInitialization(
+      InitializedEntity::InitializeParameter(S.Context, Fun->getParamDecl(0)),
+      Obj->getExprLoc(), Obj);
+}
+
+static void PrepareExplicitObjectArgument(Sema &S, CXXMethodDecl *Method,
+                                          Expr *Object, MultiExprArg &Args,
+                                          SmallVectorImpl<Expr *> &NewArgs) {
+  assert(Method->isExplicitObjectMemberFunction() &&
+         "Method is not an explicit member function");
+  assert(NewArgs.empty() && "NewArgs should be empty");
+  NewArgs.reserve(Args.size() + 1);
+  Expr *This = GetExplicitObjectExpr(S, Object, Method);
+  NewArgs.push_back(This);
+  NewArgs.append(Args.begin(), Args.end());
+  Args = NewArgs;
+}
+
 /// Determine whether the provided type is an integral type, or an enumeration
 /// type of a permitted flavor.
 bool Sema::ICEConvertDiagnoser::match(QualType T) {
@@ -6868,7 +7056,7 @@ static bool convertArgsForAvailabilityChecks(
     assert(!isa<CXXConstructorDecl>(Method) &&
            "Shouldn't have `this` for ctors!");
     assert(!Method->isStatic() && "Shouldn't have `this` for static methods!");
-    ExprResult R = S.PerformObjectArgumentInitialization(
+    ExprResult R = S.PerformImplicitObjectArgumentInitialization(
         ThisArg, /*Qualifier=*/nullptr, Method, Method);
     if (R.isInvalid())
       return false;
@@ -7168,7 +7356,8 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
   Candidate.IgnoreObjectArgument = false;
   Candidate.ExplicitCallArguments = Args.size();
 
-  unsigned NumParams = Proto->getNumParams();
+  unsigned NumParams = Method->getNumExplicitParams();
+  unsigned ExplicitOffset = Method->isExplicitObjectMemberFunction() ? 1 : 0;
 
   // (C++ 13.3.2p2): A candidate function having fewer than m
   // parameters is viable only if it has an ellipsis in its parameter
@@ -7186,7 +7375,7 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
   // (8.3.6). For the purposes of overload resolution, the
   // parameter list is truncated on the right, so that there are
   // exactly m parameters.
-  unsigned MinRequiredArgs = Method->getMinRequiredArguments();
+  unsigned MinRequiredArgs = Method->getMinRequiredExplicitArguments();
   if (Args.size() < MinRequiredArgs && !PartialOverloading) {
     // Not enough arguments.
     Candidate.Viable = false;
@@ -7215,7 +7404,7 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
     // parameter.
     Candidate.Conversions[FirstConvIdx] = TryObjectArgumentInitialization(
         *this, CandidateSet.getLocation(), ObjectType, ObjectClassification,
-        Method, ActingContext);
+        Method, ActingContext, /*InOverloadResolution=*/true);
     if (Candidate.Conversions[FirstConvIdx].isBad()) {
       Candidate.Viable = false;
       Candidate.FailureKind = ovl_fail_bad_conversion;
@@ -7255,7 +7444,7 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
       // exist for each argument an implicit conversion sequence
       // (13.3.3.1) that converts that argument to the corresponding
       // parameter of F.
-      QualType ParamType = Proto->getParamType(ArgIdx);
+      QualType ParamType = Proto->getParamType(ArgIdx + ExplicitOffset);
       Candidate.Conversions[ConvIdx]
         = TryCopyInitialization(*this, Args[ArgIdx], ParamType,
                                 SuppressUserConversions,
@@ -7320,8 +7509,8 @@ void Sema::AddMethodTemplateCandidate(
   ConversionSequenceList Conversions;
   if (TemplateDeductionResult Result = DeduceTemplateArguments(
           MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info,
-          PartialOverloading, /*AggregateDeductionCandidate=*/false,
-          [&](ArrayRef<QualType> ParamTypes) {
+          PartialOverloading, /*AggregateDeductionCandidate=*/false, ObjectType,
+          ObjectClassification, [&](ArrayRef<QualType> ParamTypes) {
             return CheckNonDependentConversions(
                 MethodTmpl, ParamTypes, Args, CandidateSet, Conversions,
                 SuppressUserConversions, ActingContext, ObjectType,
@@ -7405,6 +7594,8 @@ void Sema::AddTemplateOverloadCandidate(
   if (TemplateDeductionResult Result = DeduceTemplateArguments(
           FunctionTemplate, ExplicitTemplateArgs, Args, Specialization, Info,
           PartialOverloading, AggregateCandidateDeduction,
+          /*ObjectType=*/QualType(),
+          /*ObjectClassification=*/Expr::Classification(),
           [&](ArrayRef<QualType> ParamTypes) {
             return CheckNonDependentConversions(
                 FunctionTemplate, ParamTypes, Args, CandidateSet, Conversions,
@@ -7477,16 +7668,24 @@ bool Sema::CheckNonDependentConversions(
   if (HasThisConversion && !cast<CXXMethodDecl>(FD)->isStatic() &&
       !ObjectType.isNull()) {
     unsigned ConvIdx = PO == OverloadCandidateParamOrder::Reversed ? 1 : 0;
-    Conversions[ConvIdx] = TryObjectArgumentInitialization(
-        *this, CandidateSet.getLocation(), ObjectType, ObjectClassification,
-        Method, ActingContext);
-    if (Conversions[ConvIdx].isBad())
-      return true;
+    if (!FD->hasCXXExplicitFunctionObjectParameter() ||
+        !ParamTypes[0]->isDependentType()) {
+      Conversions[ConvIdx] = TryObjectArgumentInitialization(
+          *this, CandidateSet.getLocation(), ObjectType, ObjectClassification,
+          Method, ActingContext, /*InOverloadResolution=*/true,
+          FD->hasCXXExplicitFunctionObjectParameter() ? ParamTypes[0]
+                                                      : QualType());
+      if (Conversions[ConvIdx].isBad())
+        return true;
+    }
   }
 
+  unsigned Offset =
+      Method && Method->hasCXXExplicitFunctionObjectParameter() ? 1 : 0;
+
   for (unsigned I = 0, N = std::min(ParamTypes.size(), Args.size()); I != N;
        ++I) {
-    QualType ParamType = ParamTypes[I];
+    QualType ParamType = ParamTypes[I + Offset];
     if (!ParamType->isDependentType()) {
       unsigned ConvIdx = PO == OverloadCandidateParamOrder::Reversed
                              ? 0
@@ -7619,15 +7818,21 @@ void Sema::AddConversionCandidate(
   //
   // Determine the implicit conversion sequence for the implicit
   // object parameter.
-  QualType ImplicitParamType = From->getType();
-  if (const PointerType *FromPtrType = ImplicitParamType->getAs<PointerType>())
-    ImplicitParamType = FromPtrType->getPointeeType();
-  CXXRecordDecl *ConversionContext
-    = cast<CXXRecordDecl>(ImplicitParamType->castAs<RecordType>()->getDecl());
-
+  QualType ObjectType = From->getType();
+  if (const auto *FromPtrType = ObjectType->getAs<PointerType>())
+    ObjectType = FromPtrType->getPointeeType();
+  const auto *ConversionContext =
+      cast<CXXRecordDecl>(ObjectType->castAs<RecordType>()->getDecl());
+
+  // C++23 [over.best.ics.general]
+  // However, if the target is [...]
+  // - the object parameter of a user-defined conversion function
+  // [...] user-defined conversion sequences are not considered.
   Candidate.Conversions[0] = TryObjectArgumentInitialization(
       *this, CandidateSet.getLocation(), From->getType(),
-      From->Classify(Context), Conversion, ConversionContext);
+      From->Classify(Context), Conversion, ConversionContext,
+      /*InOverloadResolution*/ false, /*ExplicitParameterType=*/QualType(),
+      /*SuppressUserConversion*/ true);
 
   if (Candidate.Conversions[0].isBad()) {
     Candidate.Viable = false;
@@ -7781,11 +7986,14 @@ void Sema::AddTemplateConversionCandidate(
     return;
   }
 
+  QualType ObjectType = From->getType();
+  Expr::Classification ObjectClassification = From->Classify(getASTContext());
+
   TemplateDeductionInfo Info(CandidateSet.getLocation());
   CXXConversionDecl *Specialization = nullptr;
-  if (TemplateDeductionResult Result
-        = DeduceTemplateArguments(FunctionTemplate, ToType,
-                                  Specialization, Info)) {
+  if (TemplateDeductionResult Result = DeduceTemplateArguments(
+          FunctionTemplate, ObjectType, ObjectClassification, ToType,
+          Specialization, Info)) {
     OverloadCandidate &Candidate = CandidateSet.addCandidate();
     Candidate.FoundDecl = FoundDecl;
     Candidate.Function = FunctionTemplate->getTemplatedDecl();
@@ -7837,9 +8045,18 @@ void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion,
 
   // Determine the implicit conversion sequence for the implicit
   // object parameter.
-  ImplicitConversionSequence ObjectInit = TryObjectArgumentInitialization(
-      *this, CandidateSet.getLocation(), Object->getType(),
-      Object->Classify(Context), Conversion, ActingContext);
+  ImplicitConversionSequence ObjectInit;
+  if (Conversion->hasCXXExplicitFunctionObjectParameter()) {
+    ObjectInit = TryCopyInitialization(*this, Object,
+                                       Conversion->getParamDecl(0)->getType(),
+                                       /*SuppressUserConversions=*/false,
+                                       /*InOverloadResolution=*/true, false);
+  } else {
+    ObjectInit = TryObjectArgumentInitialization(
+        *this, CandidateSet.getLocation(), Object->getType(),
+        Object->Classify(Context), Conversion, ActingContext);
+  }
+
   if (ObjectInit.isBad()) {
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_bad_conversion;
@@ -9865,11 +10082,7 @@ getImplicitObjectParamType(ASTContext &Context, const FunctionDecl *F) {
   // Static member functions' object parameters match all types.
   if (M->isStatic())
     return QualType();
-
-  QualType T = M->getThisObjectType();
-  if (M->getRefQualifier() == RQ_RValue)
-    return Context.getRValueReferenceType(T);
-  return Context.getLValueReferenceType(T);
+  return M->getFunctionObjectParameterReferenceType();
 }
 
 static bool haveSameParameterTypes(ASTContext &Context, const FunctionDecl *F1,
@@ -10227,6 +10440,21 @@ bool clang::isBetterOverloadCandidate(
       //  -- F1 is the copy deduction candidate(16.3.1.8) and F2 is not
       if (Guide1->getDeductionCandidateKind() == DeductionCandidate::Copy)
         return true;
+      if (Guide2->getDeductionCandidateKind() == DeductionCandidate::Copy)
+        return false;
+
+      //  --F1 is generated from a non-template constructor and F2 is generated
+      //  from a constructor template
+      const auto *Constructor1 = Guide1->getCorrespondingConstructor();
+      const auto *Constructor2 = Guide2->getCorrespondingConstructor();
+      if (Constructor1 && Constructor2) {
+        bool isC1Templated = Constructor1->getTemplatedKind() !=
+                             FunctionDecl::TemplatedKind::TK_NonTemplate;
+        bool isC2Templated = Constructor2->getTemplatedKind() !=
+                             FunctionDecl::TemplatedKind::TK_NonTemplate;
+        if (isC1Templated != isC2Templated)
+          return isC2Templated;
+      }
     }
   }
 
@@ -10270,7 +10498,7 @@ bool clang::isBetterOverloadCandidate(
     if (AS1 != AS2) {
       if (Qualifiers::isAddressSpaceSupersetOf(AS2, AS1))
         return true;
-      if (Qualifiers::isAddressSpaceSupersetOf(AS2, AS1))
+      if (Qualifiers::isAddressSpaceSupersetOf(AS1, AS2))
         return false;
     }
   }
@@ -11072,39 +11300,43 @@ static void DiagnoseArityMismatch(Sema &S, NamedDecl *Found, Decl *D,
 
   // TODO: treat calls to a missing default constructor as a special case
   const auto *FnTy = Fn->getType()->castAs<FunctionProtoType>();
-  unsigned MinParams = Fn->getMinRequiredArguments();
+  unsigned MinParams = Fn->getMinRequiredExplicitArguments();
 
   // at least / at most / exactly
+  bool HasExplicitObjectParam = Fn->hasCXXExplicitFunctionObjectParameter();
+  unsigned ParamCount = FnTy->getNumParams() - (HasExplicitObjectParam ? 1 : 0);
   unsigned mode, modeCount;
   if (NumFormalArgs < MinParams) {
-    if (MinParams != FnTy->getNumParams() || FnTy->isVariadic() ||
+    if (MinParams != ParamCount || FnTy->isVariadic() ||
         FnTy->isTemplateVariadic())
       mode = 0; // "at least"
     else
       mode = 2; // "exactly"
     modeCount = MinParams;
   } else {
-    if (MinParams != FnTy->getNumParams())
+    if (MinParams != ParamCount)
       mode = 1; // "at most"
     else
       mode = 2; // "exactly"
-    modeCount = FnTy->getNumParams();
+    modeCount = ParamCount;
   }
 
   std::string Description;
   std::pair<OverloadCandidateKind, OverloadCandidateSelect> FnKindPair =
       ClassifyOverloadCandidate(S, Found, Fn, CRK_None, Description);
 
-  if (modeCount == 1 && Fn->getParamDecl(0)->getDeclName())
+  if (modeCount == 1 &&
+      Fn->getParamDecl(HasExplicitObjectParam ? 1 : 0)->getDeclName())
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_arity_one)
         << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second
-        << Description << mode << Fn->getParamDecl(0) << NumFormalArgs
-        << Fn->getParametersSourceRange();
+        << Description << mode
+        << Fn->getParamDecl(HasExplicitObjectParam ? 1 : 0) << NumFormalArgs
+        << HasExplicitObjectParam << Fn->getParametersSourceRange();
   else
     S.Diag(Fn->getLocation(), diag::note_ovl_candidate_arity)
         << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second
         << Description << mode << modeCount << NumFormalArgs
-        << Fn->getParametersSourceRange();
+        << HasExplicitObjectParam << Fn->getParametersSourceRange();
 
   MaybeEmitInheritedConstructorNote(S, Found);
 }
@@ -12448,7 +12680,9 @@ class AddressOfFunctionResolver {
               = dyn_cast<CXXMethodDecl>(FunctionTemplate->getTemplatedDecl())) {
       // Skip non-static function templates when converting to pointer, and
       // static when converting to member pointer.
-      if (Method->isStatic() == TargetTypeIsNonStaticMemberFunction)
+      bool CanConvertToFunctionPointer =
+          Method->isStatic() || Method->isExplicitObjectMemberFunction();
+      if (CanConvertToFunctionPointer == TargetTypeIsNonStaticMemberFunction)
         return false;
     }
     else if (TargetTypeIsNonStaticMemberFunction)
@@ -12493,7 +12727,9 @@ class AddressOfFunctionResolver {
     if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Fn)) {
       // Skip non-static functions when converting to pointer, and static
       // when converting to member pointer.
-      if (Method->isStatic() == TargetTypeIsNonStaticMemberFunction)
+      bool CanConvertToFunctionPointer =
+          Method->isStatic() || Method->isExplicitObjectMemberFunction();
+      if (CanConvertToFunctionPointer == TargetTypeIsNonStaticMemberFunction)
         return false;
     }
     else if (TargetTypeIsNonStaticMemberFunction)
@@ -12895,7 +13131,10 @@ bool Sema::resolveAndFixAddressOfSingleOverloadCandidate(
   // for both.
   DiagnoseUseOfDecl(Found, E->getExprLoc());
   CheckAddressOfMemberAccess(E, DAP);
-  Expr *Fixed = FixOverloadedFunctionReference(E, DAP, Found);
+  ExprResult Res = FixOverloadedFunctionReference(E, DAP, Found);
+  if (Res.isInvalid())
+    return false;
+  Expr *Fixed = Res.get();
   if (DoFunctionPointerConversion && Fixed->getType()->isFunctionType())
     SrcExpr = DefaultFunctionArrayConversion(Fixed, /*Diagnose=*/false);
   else
@@ -13557,10 +13796,13 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
     SemaRef.CheckUnresolvedLookupAccess(ULE, (*Best)->FoundDecl);
     if (SemaRef.DiagnoseUseOfDecl(FDecl, ULE->getNameLoc()))
       return ExprError();
-    Fn = SemaRef.FixOverloadedFunctionReference(Fn, (*Best)->FoundDecl, FDecl);
-    return SemaRef.BuildResolvedCallExpr(Fn, FDecl, LParenLoc, Args, RParenLoc,
-                                         ExecConfig, /*IsExecConfig=*/false,
-                                         (*Best)->IsADLCandidate);
+    ExprResult Res =
+        SemaRef.FixOverloadedFunctionReference(Fn, (*Best)->FoundDecl, FDecl);
+    if (Res.isInvalid())
+      return ExprError();
+    return SemaRef.BuildResolvedCallExpr(
+        Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig,
+        /*IsExecConfig=*/false, (*Best)->IsADLCandidate);
   }
 
   case OR_No_Viable_Function: {
@@ -13615,10 +13857,13 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
     // We emitted an error for the unavailable/deleted function call but keep
     // the call in the AST.
     FunctionDecl *FDecl = (*Best)->Function;
-    Fn = SemaRef.FixOverloadedFunctionReference(Fn, (*Best)->FoundDecl, FDecl);
-    return SemaRef.BuildResolvedCallExpr(Fn, FDecl, LParenLoc, Args, RParenLoc,
-                                         ExecConfig, /*IsExecConfig=*/false,
-                                         (*Best)->IsADLCandidate);
+    ExprResult Res =
+        SemaRef.FixOverloadedFunctionReference(Fn, (*Best)->FoundDecl, FDecl);
+    if (Res.isInvalid())
+      return ExprError();
+    return SemaRef.BuildResolvedCallExpr(
+        Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig,
+        /*IsExecConfig=*/false, (*Best)->IsADLCandidate);
   }
   }
 
@@ -13692,6 +13937,83 @@ ExprResult Sema::CreateUnresolvedLookupExpr(CXXRecordDecl *NamingClass,
                                       Fns.begin(), Fns.end());
 }
 
+ExprResult Sema::BuildCXXMemberCallExpr(Expr *E, NamedDecl *FoundDecl,
+                                        CXXConversionDecl *Method,
+                                        bool HadMultipleCandidates) {
+  // Convert the expression to match the conversion function's implicit object
+  // parameter.
+  ExprResult Exp;
+  if (Method->isExplicitObjectMemberFunction())
+    Exp = InitializeExplicitObjectArgument(*this, E, Method);
+  else
+    Exp = PerformImplicitObjectArgumentInitialization(E, /*Qualifier=*/nullptr,
+                                                      FoundDecl, Method);
+  if (Exp.isInvalid())
+    return true;
+
+  if (Method->getParent()->isLambda() &&
+      Method->getConversionType()->isBlockPointerType()) {
+    // This is a lambda conversion to block pointer; check if the argument
+    // was a LambdaExpr.
+    Expr *SubE = E;
+    auto *CE = dyn_cast<CastExpr>(SubE);
+    if (CE && CE->getCastKind() == CK_NoOp)
+      SubE = CE->getSubExpr();
+    SubE = SubE->IgnoreParens();
+    if (auto *BE = dyn_cast<CXXBindTemporaryExpr>(SubE))
+      SubE = BE->getSubExpr();
+    if (isa<LambdaExpr>(SubE)) {
+      // For the conversion to block pointer on a lambda expression, we
+      // construct a special BlockLiteral instead; this doesn't really make
+      // a difference in ARC, but outside of ARC the resulting block literal
+      // follows the normal lifetime rules for block literals instead of being
+      // autoreleased.
+      PushExpressionEvaluationContext(
+          ExpressionEvaluationContext::PotentiallyEvaluated);
+      ExprResult BlockExp = BuildBlockForLambdaConversion(
+          Exp.get()->getExprLoc(), Exp.get()->getExprLoc(), Method, Exp.get());
+      PopExpressionEvaluationContext();
+
+      // FIXME: This note should be produced by a CodeSynthesisContext.
+      if (BlockExp.isInvalid())
+        Diag(Exp.get()->getExprLoc(), diag::note_lambda_to_block_conv);
+      return BlockExp;
+    }
+  }
+  CallExpr *CE;
+  QualType ResultType = Method->getReturnType();
+  ExprValueKind VK = Expr::getValueKindForType(ResultType);
+  ResultType = ResultType.getNonLValueExprType(Context);
+  if (Method->isExplicitObjectMemberFunction()) {
+    ExprResult FnExpr =
+        CreateFunctionRefExpr(*this, Method, FoundDecl, Exp.get(),
+                              HadMultipleCandidates, E->getBeginLoc());
+    if (FnExpr.isInvalid())
+      return ExprError();
+    Expr *ObjectParam = Exp.get();
+    CE = CallExpr::Create(Context, FnExpr.get(), MultiExprArg(&ObjectParam, 1),
+                          ResultType, VK, Exp.get()->getEndLoc(),
+                          CurFPFeatureOverrides());
+  } else {
+    MemberExpr *ME =
+        BuildMemberExpr(Exp.get(), /*IsArrow=*/false, SourceLocation(),
+                        NestedNameSpecifierLoc(), SourceLocation(), Method,
+                        DeclAccessPair::make(FoundDecl, FoundDecl->getAccess()),
+                        HadMultipleCandidates, DeclarationNameInfo(),
+                        Context.BoundMemberTy, VK_PRValue, OK_Ordinary);
+
+    CE = CXXMemberCallExpr::Create(Context, ME, /*Args=*/{}, ResultType, VK,
+                                   Exp.get()->getEndLoc(),
+                                   CurFPFeatureOverrides());
+  }
+
+  if (CheckFunctionCall(Method, CE,
+                        Method->getType()->castAs<FunctionProtoType>()))
+    return ExprError();
+
+  return CheckForImmediateInvocation(CE, CE->getDirectCallee());
+}
+
 /// Create a unary operation that may resolve to an overloaded
 /// operator.
 ///
@@ -13786,14 +14108,17 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
 
       // Convert the arguments.
       if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(FnDecl)) {
-        CheckMemberOperatorAccess(OpLoc, Args[0], nullptr, Best->FoundDecl);
+        CheckMemberOperatorAccess(OpLoc, Input, nullptr, Best->FoundDecl);
 
-        ExprResult InputRes =
-          PerformObjectArgumentInitialization(Input, /*Qualifier=*/nullptr,
-                                              Best->FoundDecl, Method);
-        if (InputRes.isInvalid())
+        ExprResult InputInit;
+        if (Method->isExplicitObjectMemberFunction())
+          InputInit = InitializeExplicitObjectArgument(*this, Input, Method);
+        else
+          InputInit = PerformImplicitObjectArgumentInitialization(
+              Input, /*Qualifier=*/nullptr, Best->FoundDecl, Method);
+        if (InputInit.isInvalid())
           return ExprError();
-        Base = Input = InputRes.get();
+        Base = Input = InputInit.get();
       } else {
         // Convert the arguments.
         ExprResult InputInit
@@ -14127,13 +14452,16 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
               if (auto *MD = dyn_cast<CXXMethodDecl>(FnDecl))
                 if (Op == OverloadedOperatorKind::OO_EqualEqual &&
                     !MD->isConst() &&
+                    !MD->hasCXXExplicitFunctionObjectParameter() &&
                     Context.hasSameUnqualifiedType(
-                        MD->getThisObjectType(),
+                        MD->getFunctionObjectParameterType(),
                         MD->getParamDecl(0)->getType().getNonReferenceType()) &&
-                    Context.hasSameUnqualifiedType(MD->getThisObjectType(),
-                                                   Args[0]->getType()) &&
-                    Context.hasSameUnqualifiedType(MD->getThisObjectType(),
-                                                   Args[1]->getType()))
+                    Context.hasSameUnqualifiedType(
+                        MD->getFunctionObjectParameterType(),
+                        Args[0]->getType()) &&
+                    Context.hasSameUnqualifiedType(
+                        MD->getFunctionObjectParameterType(),
+                        Args[1]->getType()))
                   Diag(FnDecl->getLocation(),
                        diag::note_ovl_ambiguous_eqeq_reversed_self_non_const);
             } else {
@@ -14151,19 +14479,22 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
           // Best->Access is only meaningful for class members.
           CheckMemberOperatorAccess(OpLoc, Args[0], Args[1], Best->FoundDecl);
 
-          ExprResult Arg1 =
-            PerformCopyInitialization(
-              InitializedEntity::InitializeParameter(Context,
-                                                     FnDecl->getParamDecl(0)),
+          ExprResult Arg0, Arg1;
+          unsigned ParamIdx = 0;
+          if (Method->isExplicitObjectMemberFunction()) {
+            Arg0 = InitializeExplicitObjectArgument(*this, Args[0], FnDecl);
+            ParamIdx = 1;
+          } else {
+            Arg0 = PerformImplicitObjectArgumentInitialization(
+                Args[0], /*Qualifier=*/nullptr, Best->FoundDecl, Method);
+          }
+          Arg1 = PerformCopyInitialization(
+              InitializedEntity::InitializeParameter(
+                  Context, FnDecl->getParamDecl(ParamIdx)),
               SourceLocation(), Args[1]);
-          if (Arg1.isInvalid())
+          if (Arg0.isInvalid() || Arg1.isInvalid())
             return ExprError();
 
-          ExprResult Arg0 =
-            PerformObjectArgumentInitialization(Args[0], /*Qualifier=*/nullptr,
-                                                Best->FoundDecl, Method);
-          if (Arg0.isInvalid())
-            return ExprError();
           Base = Args[0] = Arg0.getAs<Expr>();
           Args[1] = RHS = Arg1.getAs<Expr>();
         } else {
@@ -14198,22 +14529,27 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
         ExprValueKind VK = Expr::getValueKindForType(ResultTy);
         ResultTy = ResultTy.getNonLValueExprType(Context);
 
-        CXXOperatorCallExpr *TheCall = CXXOperatorCallExpr::Create(
+        CallExpr *TheCall;
+        ArrayRef<const Expr *> ArgsArray(Args, 2);
+        const Expr *ImplicitThis = nullptr;
+
+        // We always create a CXXOperatorCallExpr, even for explicit object
+        // members; CodeGen should take care not to emit the this pointer.
+        TheCall = CXXOperatorCallExpr::Create(
             Context, ChosenOp, FnExpr.get(), Args, ResultTy, VK, OpLoc,
             CurFPFeatureOverrides(), Best->IsADLCandidate);
 
-        if (CheckCallReturnType(FnDecl->getReturnType(), OpLoc, TheCall,
-                                FnDecl))
-          return ExprError();
-
-        ArrayRef<const Expr *> ArgsArray(Args, 2);
-        const Expr *ImplicitThis = nullptr;
-        // Cut off the implicit 'this'.
-        if (isa<CXXMethodDecl>(FnDecl)) {
+        if (const auto *Method = dyn_cast<CXXMethodDecl>(FnDecl);
+            Method && Method->isImplicitObjectMemberFunction()) {
+          // Cut off the implicit 'this'.
           ImplicitThis = ArgsArray[0];
           ArgsArray = ArgsArray.slice(1);
         }
 
+        if (CheckCallReturnType(FnDecl->getReturnType(), OpLoc, TheCall,
+                                FnDecl))
+          return ExprError();
+
         // Check for a self move.
         if (Op == OO_Equal)
           DiagnoseSelfMove(Args[0], Args[1], OpLoc);
@@ -14221,7 +14557,7 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
         if (ImplicitThis) {
           QualType ThisType = Context.getPointerType(ImplicitThis->getType());
           QualType ThisTypeFromDecl = Context.getPointerType(
-              cast<CXXMethodDecl>(FnDecl)->getThisObjectType());
+              cast<CXXMethodDecl>(FnDecl)->getFunctionObjectParameterType());
 
           CheckArgAlignment(OpLoc, FnDecl, "'this'", ThisType,
                             ThisTypeFromDecl);
@@ -14600,8 +14936,15 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc,
         SmallVector<Expr *, 2> MethodArgs;
 
         // Handle 'this' parameter if the selected function is not static.
-        if (Method->isInstance()) {
-          ExprResult Arg0 = PerformObjectArgumentInitialization(
+        if (Method->isExplicitObjectMemberFunction()) {
+          ExprResult Res =
+              InitializeExplicitObjectArgument(*this, Args[0], Method);
+          if (Res.isInvalid())
+            return ExprError();
+          Args[0] = Res.get();
+          ArgExpr = Args;
+        } else if (Method->isInstance()) {
+          ExprResult Arg0 = PerformImplicitObjectArgumentInitialization(
               Args[0], /*Qualifier=*/nullptr, Best->FoundDecl, Method);
           if (Arg0.isInvalid())
             return ExprError();
@@ -14803,6 +15146,7 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
 
   MemberExpr *MemExpr;
   CXXMethodDecl *Method = nullptr;
+  bool HadMultipleCandidates = false;
   DeclAccessPair FoundDecl = DeclAccessPair::make(nullptr, AS_public);
   NestedNameSpecifier *Qualifier = nullptr;
   if (isa<MemberExpr>(NakedMemExpr)) {
@@ -14834,11 +15178,24 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
     for (UnresolvedMemberExpr::decls_iterator I = UnresExpr->decls_begin(),
            E = UnresExpr->decls_end(); I != E; ++I) {
 
+      QualType ExplicitObjectType = ObjectType;
+
       NamedDecl *Func = *I;
       CXXRecordDecl *ActingDC = cast<CXXRecordDecl>(Func->getDeclContext());
       if (isa<UsingShadowDecl>(Func))
         Func = cast<UsingShadowDecl>(Func)->getTargetDecl();
 
+      bool HasExplicitParameter = false;
+      if (const auto *M = dyn_cast<FunctionDecl>(Func);
+          M && M->hasCXXExplicitFunctionObjectParameter())
+        HasExplicitParameter = true;
+      else if (const auto *M = dyn_cast<FunctionTemplateDecl>(Func);
+               M &&
+               M->getTemplatedDecl()->hasCXXExplicitFunctionObjectParameter())
+        HasExplicitParameter = true;
+
+      if (HasExplicitParameter)
+        ExplicitObjectType = GetExplicitObjectType(*this, UnresExpr);
 
       // Microsoft supports direct constructor calls.
       if (getLangOpts().MicrosoftExt && isa<CXXConstructorDecl>(Func)) {
@@ -14851,17 +15208,20 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
         if (TemplateArgs)
           continue;
 
-        AddMethodCandidate(Method, I.getPair(), ActingDC, ObjectType,
+        AddMethodCandidate(Method, I.getPair(), ActingDC, ExplicitObjectType,
                            ObjectClassification, Args, CandidateSet,
                            /*SuppressUserConversions=*/false);
       } else {
-        AddMethodTemplateCandidate(
-            cast<FunctionTemplateDecl>(Func), I.getPair(), ActingDC,
-            TemplateArgs, ObjectType, ObjectClassification, Args, CandidateSet,
-            /*SuppressUserConversions=*/false);
+        AddMethodTemplateCandidate(cast<FunctionTemplateDecl>(Func),
+                                   I.getPair(), ActingDC, TemplateArgs,
+                                   ExplicitObjectType, ObjectClassification,
+                                   Args, CandidateSet,
+                                   /*SuppressUserConversions=*/false);
       }
     }
 
+    HadMultipleCandidates = (CandidateSet.size() > 1);
+
     DeclarationName DeclName = UnresExpr->getMemberName();
 
     UnbridgedCasts.restore();
@@ -14915,10 +15275,14 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
     if (!Succeeded)
       return BuildRecoveryExpr(chooseRecoveryType(CandidateSet, &Best));
 
-    MemExprE = FixOverloadedFunctionReference(MemExprE, FoundDecl, Method);
+    ExprResult Res =
+        FixOverloadedFunctionReference(MemExprE, FoundDecl, Method);
+    if (Res.isInvalid())
+      return ExprError();
+    MemExprE = Res.get();
 
-    // If overload resolution picked a static member, build a
-    // non-member call based on that function.
+    // If overload resolution picked a static member
+    // build a non-member call based on that function.
     if (Method->isStatic()) {
       return BuildResolvedCallExpr(MemExprE, Method, LParenLoc, Args, RParenLoc,
                                    ExecConfig, IsExecConfig);
@@ -14933,27 +15297,41 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
 
   assert(Method && "Member call to something that isn't a method?");
   const auto *Proto = Method->getType()->castAs<FunctionProtoType>();
-  CXXMemberCallExpr *TheCall = CXXMemberCallExpr::Create(
-      Context, MemExprE, Args, ResultType, VK, RParenLoc,
-      CurFPFeatureOverrides(), Proto->getNumParams());
 
-  // Check for a valid return type.
-  if (CheckCallReturnType(Method->getReturnType(), MemExpr->getMemberLoc(),
-                          TheCall, Method))
-    return BuildRecoveryExpr(ResultType);
+  CallExpr *TheCall = nullptr;
+  llvm::SmallVector<Expr *, 8> NewArgs;
+  if (Method->isExplicitObjectMemberFunction()) {
+    PrepareExplicitObjectArgument(*this, Method, MemExpr->getBase(), Args,
+                                  NewArgs);
+    // Build the actual expression node.
+    ExprResult FnExpr =
+        CreateFunctionRefExpr(*this, Method, FoundDecl, MemExpr,
+                              HadMultipleCandidates, MemExpr->getExprLoc());
+    if (FnExpr.isInvalid())
+      return ExprError();
 
-  // Convert the object argument (for a non-static member function call).
-  // We only need to do this if there was actually an overload; otherwise
-  // it was done at lookup.
-  if (!Method->isStatic()) {
-    ExprResult ObjectArg =
-      PerformObjectArgumentInitialization(MemExpr->getBase(), Qualifier,
-                                          FoundDecl, Method);
+    TheCall =
+        CallExpr::Create(Context, FnExpr.get(), Args, ResultType, VK, RParenLoc,
+                         CurFPFeatureOverrides(), Proto->getNumParams());
+  } else {
+    // Convert the object argument (for a non-static member function call).
+    // We only need to do this if there was actually an overload; otherwise
+    // it was done at lookup.
+    ExprResult ObjectArg = PerformImplicitObjectArgumentInitialization(
+        MemExpr->getBase(), Qualifier, FoundDecl, Method);
     if (ObjectArg.isInvalid())
       return ExprError();
     MemExpr->setBase(ObjectArg.get());
+    TheCall = CXXMemberCallExpr::Create(Context, MemExprE, Args, ResultType, VK,
+                                        RParenLoc, CurFPFeatureOverrides(),
+                                        Proto->getNumParams());
   }
 
+  // Check for a valid return type.
+  if (CheckCallReturnType(Method->getReturnType(), MemExpr->getMemberLoc(),
+                          TheCall, Method))
+    return BuildRecoveryExpr(ResultType);
+
   // Convert the rest of the arguments
   if (ConvertArgumentsForCall(TheCall, MemExpr, Method, Proto, Args,
                               RParenLoc))
@@ -14980,10 +15358,9 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
     }
   }
 
-  if ((isa<CXXConstructorDecl>(CurContext) ||
-       isa<CXXDestructorDecl>(CurContext)) &&
-      TheCall->getMethodDecl()->isPure()) {
-    const CXXMethodDecl *MD = TheCall->getMethodDecl();
+  if (isa<CXXConstructorDecl, CXXDestructorDecl>(CurContext) &&
+      TheCall->getDirectCallee()->isPure()) {
+    const FunctionDecl *MD = TheCall->getDirectCallee();
 
     if (isa<CXXThisExpr>(MemExpr->getBase()->IgnoreParenCasts()) &&
         MemExpr->performsVirtualDispatch(getLangOpts())) {
@@ -14999,8 +15376,7 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
     }
   }
 
-  if (CXXDestructorDecl *DD =
-          dyn_cast<CXXDestructorDecl>(TheCall->getMethodDecl())) {
+  if (auto *DD = dyn_cast<CXXDestructorDecl>(TheCall->getDirectCallee())) {
     // a->A::f() doesn't go through the vtable, except in AppleKext mode.
     bool CallCanBeVirtual = !MemExpr->hasQualifier() || getLangOpts().AppleKext;
     CheckVirtualDtorCall(DD, MemExpr->getBeginLoc(), /*IsDelete=*/false,
@@ -15009,7 +15385,7 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
   }
 
   return CheckForImmediateInvocation(MaybeBindToTemporary(TheCall),
-                                     TheCall->getMethodDecl());
+                                     TheCall->getDirectCallee());
 }
 
 /// BuildCallToObjectOfClassType - Build a call to an object of class
@@ -15234,8 +15610,13 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj,
   // Initialize the implicit object parameter if needed.
   // Since C++23, this could also be a call to a static call operator
   // which we emit as a regular CallExpr.
-  if (Method->isInstance()) {
-    ExprResult ObjRes = PerformObjectArgumentInitialization(
+  llvm::SmallVector<Expr *, 8> NewArgs;
+  if (Method->isExplicitObjectMemberFunction()) {
+    // FIXME: we should do that during the definition of the lambda when we can.
+    DiagnoseInvalidExplicitObjectParameterInLambda(Method);
+    PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs);
+  } else if (Method->isInstance()) {
+    ExprResult ObjRes = PerformImplicitObjectArgumentInitialization(
         Object.get(), /*Qualifier=*/nullptr, Best->FoundDecl, Method);
     if (ObjRes.isInvalid())
       IsError = true;
@@ -15377,12 +15758,19 @@ Sema::BuildOverloadedArrowExpr(Scope *S, Expr *Base, SourceLocation OpLoc,
 
   // Convert the object parameter.
   CXXMethodDecl *Method = cast<CXXMethodDecl>(Best->Function);
-  ExprResult BaseResult =
-    PerformObjectArgumentInitialization(Base, /*Qualifier=*/nullptr,
-                                        Best->FoundDecl, Method);
-  if (BaseResult.isInvalid())
-    return ExprError();
-  Base = BaseResult.get();
+
+  if (Method->isExplicitObjectMemberFunction()) {
+    ExprResult R = InitializeExplicitObjectArgument(*this, Base, Method);
+    if (R.isInvalid())
+      return ExprError();
+    Base = R.get();
+  } else {
+    ExprResult BaseResult = PerformImplicitObjectArgumentInitialization(
+        Base, /*Qualifier=*/nullptr, Best->FoundDecl, Method);
+    if (BaseResult.isInvalid())
+      return ExprError();
+    Base = BaseResult.get();
+  }
 
   // Build the operator call.
   ExprResult FnExpr = CreateFunctionRefExpr(*this, Method, Best->FoundDecl,
@@ -15393,7 +15781,8 @@ Sema::BuildOverloadedArrowExpr(Scope *S, Expr *Base, SourceLocation OpLoc,
   QualType ResultTy = Method->getReturnType();
   ExprValueKind VK = Expr::getValueKindForType(ResultTy);
   ResultTy = ResultTy.getNonLValueExprType(Context);
-  CXXOperatorCallExpr *TheCall =
+
+  CallExpr *TheCall =
       CXXOperatorCallExpr::Create(Context, OO_Arrow, FnExpr.get(), Base,
                                   ResultTy, VK, OpLoc, CurFPFeatureOverrides());
 
@@ -15559,37 +15948,44 @@ Sema::BuildForRangeBeginEndCall(SourceLocation Loc,
 /// perhaps a '&' around it). We have resolved the overloaded function
 /// to the function declaration Fn, so patch up the expression E to
 /// refer (possibly indirectly) to Fn. Returns the new expr.
-Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
-                                           FunctionDecl *Fn) {
+ExprResult Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
+                                                FunctionDecl *Fn) {
   if (ParenExpr *PE = dyn_cast<ParenExpr>(E)) {
-    Expr *SubExpr = FixOverloadedFunctionReference(PE->getSubExpr(),
-                                                   Found, Fn);
-    if (SubExpr == PE->getSubExpr())
+    ExprResult SubExpr =
+        FixOverloadedFunctionReference(PE->getSubExpr(), Found, Fn);
+    if (SubExpr.isInvalid())
+      return ExprError();
+    if (SubExpr.get() == PE->getSubExpr())
       return PE;
 
-    return new (Context) ParenExpr(PE->getLParen(), PE->getRParen(), SubExpr);
+    return new (Context)
+        ParenExpr(PE->getLParen(), PE->getRParen(), SubExpr.get());
   }
 
   if (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(E)) {
-    Expr *SubExpr = FixOverloadedFunctionReference(ICE->getSubExpr(),
-                                                   Found, Fn);
+    ExprResult SubExpr =
+        FixOverloadedFunctionReference(ICE->getSubExpr(), Found, Fn);
+    if (SubExpr.isInvalid())
+      return ExprError();
     assert(Context.hasSameType(ICE->getSubExpr()->getType(),
-                               SubExpr->getType()) &&
+                               SubExpr.get()->getType()) &&
            "Implicit cast type cannot be determined from overload");
     assert(ICE->path_empty() && "fixing up hierarchy conversion?");
-    if (SubExpr == ICE->getSubExpr())
+    if (SubExpr.get() == ICE->getSubExpr())
       return ICE;
 
     return ImplicitCastExpr::Create(Context, ICE->getType(), ICE->getCastKind(),
-                                    SubExpr, nullptr, ICE->getValueKind(),
+                                    SubExpr.get(), nullptr, ICE->getValueKind(),
                                     CurFPFeatureOverrides());
   }
 
   if (auto *GSE = dyn_cast<GenericSelectionExpr>(E)) {
     if (!GSE->isResultDependent()) {
-      Expr *SubExpr =
+      ExprResult SubExpr =
           FixOverloadedFunctionReference(GSE->getResultExpr(), Found, Fn);
-      if (SubExpr == GSE->getResultExpr())
+      if (SubExpr.isInvalid())
+        return ExprError();
+      if (SubExpr.get() == GSE->getResultExpr())
         return GSE;
 
       // Replace the resulting type information before rebuilding the generic
@@ -15597,7 +15993,7 @@ Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
       ArrayRef<Expr *> A = GSE->getAssocExprs();
       SmallVector<Expr *, 4> AssocExprs(A.begin(), A.end());
       unsigned ResultIdx = GSE->getResultIndex();
-      AssocExprs[ResultIdx] = SubExpr;
+      AssocExprs[ResultIdx] = SubExpr.get();
 
       if (GSE->isExprPredicate())
         return GenericSelectionExpr::Create(
@@ -15627,15 +16023,21 @@ Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
         // Fix the subexpression, which really has to be an
         // UnresolvedLookupExpr holding an overloaded member function
         // or template.
-        Expr *SubExpr = FixOverloadedFunctionReference(UnOp->getSubExpr(),
-                                                       Found, Fn);
-        if (SubExpr == UnOp->getSubExpr())
+        ExprResult SubExpr =
+            FixOverloadedFunctionReference(UnOp->getSubExpr(), Found, Fn);
+        if (SubExpr.isInvalid())
+          return ExprError();
+        if (SubExpr.get() == UnOp->getSubExpr())
           return UnOp;
 
-        assert(isa<DeclRefExpr>(SubExpr)
-               && "fixed to something other than a decl ref");
-        assert(cast<DeclRefExpr>(SubExpr)->getQualifier()
-               && "fixed to a member ref with no nested name qualifier");
+        if (CheckUseOfCXXMethodAsAddressOfOperand(UnOp->getBeginLoc(),
+                                                  SubExpr.get(), Method))
+          return ExprError();
+
+        assert(isa<DeclRefExpr>(SubExpr.get()) &&
+               "fixed to something other than a decl ref");
+        assert(cast<DeclRefExpr>(SubExpr.get())->getQualifier() &&
+               "fixed to a member ref with no nested name qualifier");
 
         // We have taken the address of a pointer to member
         // function. Perform the computation here so that we get the
@@ -15648,19 +16050,21 @@ Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
         if (Context.getTargetInfo().getCXXABI().isMicrosoft())
           (void)isCompleteType(UnOp->getOperatorLoc(), MemPtrType);
 
-        return UnaryOperator::Create(
-            Context, SubExpr, UO_AddrOf, MemPtrType, VK_PRValue, OK_Ordinary,
-            UnOp->getOperatorLoc(), false, CurFPFeatureOverrides());
+        return UnaryOperator::Create(Context, SubExpr.get(), UO_AddrOf,
+                                     MemPtrType, VK_PRValue, OK_Ordinary,
+                                     UnOp->getOperatorLoc(), false,
+                                     CurFPFeatureOverrides());
       }
     }
-    Expr *SubExpr = FixOverloadedFunctionReference(UnOp->getSubExpr(),
-                                                   Found, Fn);
-    if (SubExpr == UnOp->getSubExpr())
+    ExprResult SubExpr =
+        FixOverloadedFunctionReference(UnOp->getSubExpr(), Found, Fn);
+    if (SubExpr.isInvalid())
+      return ExprError();
+    if (SubExpr.get() == UnOp->getSubExpr())
       return UnOp;
 
-    // FIXME: This can't currently fail, but in principle it could.
-    return CreateBuiltinUnaryOp(UnOp->getOperatorLoc(), UO_AddrOf, SubExpr)
-        .get();
+    return CreateBuiltinUnaryOp(UnOp->getOperatorLoc(), UO_AddrOf,
+                                SubExpr.get());
   }
 
   if (UnresolvedLookupExpr *ULE = dyn_cast<UnresolvedLookupExpr>(E)) {
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 10adfbc406dfbb..333226963aeac5 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -689,7 +689,7 @@ bool Sema::checkMustTailAttr(const Stmt *St, const Attr &MTA) {
     if (CMD->isStatic())
       Type.MemberType = FuncType::ft_static_member;
     else {
-      Type.This = CMD->getThisObjectType();
+      Type.This = CMD->getFunctionObjectParameterType();
       Type.MemberType = FuncType::ft_non_static_member;
     }
     Type.Func = CMD->getType()->castAs<FunctionProtoType>();
diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp
index 2acb269f042399..83351b703c1536 100644
--- a/clang/lib/Sema/SemaStmtAsm.cpp
+++ b/clang/lib/Sema/SemaStmtAsm.cpp
@@ -271,7 +271,8 @@ StmtResult Sema::ActOnGCCAsmStmt(SourceLocation AsmLoc, bool IsSimple,
       OutputName = Names[i]->getName();
 
     TargetInfo::ConstraintInfo Info(Literal->getString(), OutputName);
-    if (!Context.getTargetInfo().validateOutputConstraint(Info)) {
+    if (!Context.getTargetInfo().validateOutputConstraint(Info) &&
+        !(LangOpts.HIPStdPar && LangOpts.CUDAIsDevice)) {
       targetDiag(Literal->getBeginLoc(),
                  diag::err_asm_invalid_output_constraint)
           << Info.getConstraintStr();
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 994d89e25b3848..080f005e04402c 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -753,7 +753,8 @@ Sema::ActOnDependentIdExpression(const CXXScopeSpec &SS,
     IsEnum = isa_and_nonnull<EnumType>(NNS->getAsType());
 
   if (!MightBeCxx11UnevalField && !isAddressOfOperand && !IsEnum &&
-      isa<CXXMethodDecl>(DC) && cast<CXXMethodDecl>(DC)->isInstance()) {
+      isa<CXXMethodDecl>(DC) &&
+      cast<CXXMethodDecl>(DC)->isImplicitObjectMemberFunction()) {
     QualType ThisType = cast<CXXMethodDecl>(DC)->getThisType().getNonReferenceType();
 
     // Since the 'this' expression is synthesized, we don't need to
@@ -7079,7 +7080,8 @@ CheckTemplateArgumentPointerToMember(Sema &S, NonTypeTemplateParmDecl *Param,
       isa<CXXMethodDecl>(DRE->getDecl())) {
     assert((isa<FieldDecl>(DRE->getDecl()) ||
             isa<IndirectFieldDecl>(DRE->getDecl()) ||
-            !cast<CXXMethodDecl>(DRE->getDecl())->isStatic()) &&
+            cast<CXXMethodDecl>(DRE->getDecl())
+                ->isImplicitObjectMemberFunction()) &&
            "Only non-static member pointers can make it here");
 
     // Okay: this is the address of a non-static member, and therefore
@@ -7631,7 +7633,10 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
         if (DiagnoseUseOfDecl(Fn, Arg->getBeginLoc()))
           return ExprError();
 
-        Arg = FixOverloadedFunctionReference(Arg, FoundResult, Fn);
+        ExprResult Res = FixOverloadedFunctionReference(Arg, FoundResult, Fn);
+        if (Res.isInvalid())
+          return ExprError();
+        Arg = Res.get();
         ArgType = Arg->getType();
       } else
         return ExprError();
@@ -7682,8 +7687,10 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
                                                                 FoundResult)) {
         if (DiagnoseUseOfDecl(Fn, Arg->getBeginLoc()))
           return ExprError();
-
-        Arg = FixOverloadedFunctionReference(Arg, FoundResult, Fn);
+        ExprResult Res = FixOverloadedFunctionReference(Arg, FoundResult, Fn);
+        if (Res.isInvalid())
+          return ExprError();
+        Arg = Res.get();
         ArgType = Arg->getType();
       } else
         return ExprError();
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 31ea7be2975e49..62fbd903a04044 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -1905,11 +1905,11 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch(
 
       QualType PPT = MPP->getPointeeType();
       if (PPT->isFunctionType())
-        S.adjustMemberFunctionCC(PPT, /*IsStatic=*/true,
+        S.adjustMemberFunctionCC(PPT, /*HasThisPointer=*/false,
                                  /*IsCtorOrDtor=*/false, Info.getLocation());
       QualType APT = MPA->getPointeeType();
       if (APT->isFunctionType())
-        S.adjustMemberFunctionCC(APT, /*IsStatic=*/true,
+        S.adjustMemberFunctionCC(APT, /*HasThisPointer=*/false,
                                  /*IsCtorOrDtor=*/false, Info.getLocation());
 
       unsigned SubTDF = TDF & TDF_IgnoreQualifiers;
@@ -3685,7 +3685,9 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
       OriginalCallArg OriginalArg = (*OriginalCallArgs)[I];
 
       auto ParamIdx = OriginalArg.ArgIdx;
-      if (ParamIdx >= Specialization->getNumParams())
+      unsigned ExplicitOffset =
+          Specialization->hasCXXExplicitFunctionObjectParameter() ? 1 : 0;
+      if (ParamIdx >= Specialization->getNumParams() - ExplicitOffset)
         // FIXME: This presumably means a pack ended up smaller than we
         // expected while deducing. Should this not result in deduction
         // failure? Can it even happen?
@@ -3695,7 +3697,8 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
       if (!OriginalArg.DecomposedParam) {
         // P is one of the function parameters, just look up its substituted
         // type.
-        DeducedA = Specialization->getParamDecl(ParamIdx)->getType();
+        DeducedA =
+            Specialization->getParamDecl(ParamIdx + ExplicitOffset)->getType();
       } else {
         // P is a decomposed element of a parameter corresponding to a
         // braced-init-list argument. Substitute back into P to find the
@@ -3745,7 +3748,7 @@ static QualType GetTypeOfFunction(Sema &S, const OverloadExpr::FindResult &R,
     return {};
 
   if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Fn))
-    if (Method->isInstance()) {
+    if (Method->isImplicitObjectMemberFunction()) {
       // An instance method that's referenced in a form that doesn't
       // look like a member pointer is just invalid.
       if (!R.HasFormOfMemberPointer)
@@ -3874,7 +3877,8 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams,
 /// overloaded function set that could not be resolved.
 static bool AdjustFunctionParmAndArgTypesForDeduction(
     Sema &S, TemplateParameterList *TemplateParams, unsigned FirstInnerIndex,
-    QualType &ParamType, QualType &ArgType, Expr *Arg, unsigned &TDF,
+    QualType &ParamType, QualType &ArgType,
+    Expr::Classification ArgClassification, Expr *Arg, unsigned &TDF,
     TemplateSpecCandidateSet *FailedTSC = nullptr) {
   // C++0x [temp.deduct.call]p3:
   //   If P is a cv-qualified type, the top level cv-qualifiers of P's type
@@ -3892,6 +3896,7 @@ static bool AdjustFunctionParmAndArgTypesForDeduction(
   // but there are sometimes special circumstances.  Typically
   // involving a template-id-expr.
   if (ArgType == S.Context.OverloadTy) {
+    assert(Arg && "expected a non-null arg expression");
     ArgType = ResolveOverloadForDeduction(S, TemplateParams, Arg, ParamType,
                                           ParamRefType != nullptr, FailedTSC);
     if (ArgType.isNull())
@@ -3900,14 +3905,16 @@ static bool AdjustFunctionParmAndArgTypesForDeduction(
 
   if (ParamRefType) {
     // If the argument has incomplete array type, try to complete its type.
-    if (ArgType->isIncompleteArrayType())
+    if (ArgType->isIncompleteArrayType()) {
+      assert(Arg && "expected a non-null arg expression");
       ArgType = S.getCompletedType(Arg);
+    }
 
     // C++1z [temp.deduct.call]p3:
     //   If P is a forwarding reference and the argument is an lvalue, the type
     //   "lvalue reference to A" is used in place of A for type deduction.
     if (isForwardingReference(QualType(ParamRefType, 0), FirstInnerIndex) &&
-        Arg->isLValue()) {
+        ArgClassification.isLValue()) {
       if (S.getLangOpts().OpenCL && !ArgType.hasAddressSpace())
         ArgType = S.Context.getAddrSpaceQualType(
             ArgType, S.Context.getDefaultOpenCLPointeeAddrSpace());
@@ -3968,7 +3975,9 @@ hasDeducibleTemplateParameters(Sema &S, FunctionTemplateDecl *FunctionTemplate,
 
 static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
     Sema &S, TemplateParameterList *TemplateParams, unsigned FirstInnerIndex,
-    QualType ParamType, Expr *Arg, TemplateDeductionInfo &Info,
+    QualType ParamType, QualType ArgType,
+    Expr::Classification ArgClassification, Expr *Arg,
+    TemplateDeductionInfo &Info,
     SmallVectorImpl<DeducedTemplateArgument> &Deduced,
     SmallVectorImpl<Sema::OriginalCallArg> &OriginalCallArgs,
     bool DecomposedParam, unsigned ArgIdx, unsigned TDF,
@@ -4013,8 +4022,9 @@ static Sema::TemplateDeductionResult DeduceFromInitializerList(
   if (ElTy->isDependentType()) {
     for (Expr *E : ILE->inits()) {
       if (auto Result = DeduceTemplateArgumentsFromCallArgument(
-              S, TemplateParams, 0, ElTy, E, Info, Deduced, OriginalCallArgs, true,
-              ArgIdx, TDF))
+              S, TemplateParams, 0, ElTy, E->getType(),
+              E->Classify(S.getASTContext()), E, Info, Deduced,
+              OriginalCallArgs, true, ArgIdx, TDF))
         return Result;
     }
   }
@@ -4045,23 +4055,25 @@ static Sema::TemplateDeductionResult DeduceFromInitializerList(
 ///        single parameter / argument pair.
 static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
     Sema &S, TemplateParameterList *TemplateParams, unsigned FirstInnerIndex,
-    QualType ParamType, Expr *Arg, TemplateDeductionInfo &Info,
+    QualType ParamType, QualType ArgType,
+    Expr::Classification ArgClassification, Expr *Arg,
+    TemplateDeductionInfo &Info,
     SmallVectorImpl<DeducedTemplateArgument> &Deduced,
     SmallVectorImpl<Sema::OriginalCallArg> &OriginalCallArgs,
     bool DecomposedParam, unsigned ArgIdx, unsigned TDF,
     TemplateSpecCandidateSet *FailedTSC) {
-  QualType ArgType = Arg->getType();
+
   QualType OrigParamType = ParamType;
 
   //   If P is a reference type [...]
   //   If P is a cv-qualified type [...]
-  if (AdjustFunctionParmAndArgTypesForDeduction(S, TemplateParams,
-                                                FirstInnerIndex, ParamType,
-                                                ArgType, Arg, TDF, FailedTSC))
+  if (AdjustFunctionParmAndArgTypesForDeduction(
+          S, TemplateParams, FirstInnerIndex, ParamType, ArgType,
+          ArgClassification, Arg, TDF, FailedTSC))
     return Sema::TDK_Success;
 
   //   If [...] the argument is a non-empty initializer list [...]
-  if (InitListExpr *ILE = dyn_cast<InitListExpr>(Arg))
+  if (InitListExpr *ILE = dyn_cast_if_present<InitListExpr>(Arg))
     return DeduceFromInitializerList(S, TemplateParams, ParamType, ILE, Info,
                                      Deduced, OriginalCallArgs, ArgIdx, TDF);
 
@@ -4070,8 +4082,9 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
   //
   // Keep track of the argument type and corresponding parameter index,
   // so we can check for compatibility between the deduced A and A.
-  OriginalCallArgs.push_back(
-      Sema::OriginalCallArg(OrigParamType, DecomposedParam, ArgIdx, ArgType));
+  if (Arg)
+    OriginalCallArgs.push_back(
+        Sema::OriginalCallArg(OrigParamType, DecomposedParam, ArgIdx, ArgType));
   return DeduceTemplateArgumentsByTypeMatch(S, TemplateParams, ParamType,
                                             ArgType, Info, Deduced, TDF);
 }
@@ -4106,12 +4119,19 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
     TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef<Expr *> Args,
     FunctionDecl *&Specialization, TemplateDeductionInfo &Info,
     bool PartialOverloading, bool AggregateDeductionCandidate,
+    QualType ObjectType, Expr::Classification ObjectClassification,
     llvm::function_ref<bool(ArrayRef<QualType>)> CheckNonDependent) {
   if (FunctionTemplate->isInvalidDecl())
     return TDK_Invalid;
 
   FunctionDecl *Function = FunctionTemplate->getTemplatedDecl();
   unsigned NumParams = Function->getNumParams();
+  bool HasExplicitObject = false;
+  int ExplicitObjectOffset = 0;
+  if (Function->hasCXXExplicitFunctionObjectParameter()) {
+    HasExplicitObject = true;
+    ExplicitObjectOffset = 1;
+  }
 
   unsigned FirstInnerIndex = getFirstInnerIndex(FunctionTemplate);
 
@@ -4119,9 +4139,11 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
   //   Template argument deduction is done by comparing each function template
   //   parameter type (call it P) with the type of the corresponding argument
   //   of the call (call it A) as described below.
-  if (Args.size() < Function->getMinRequiredArguments() && !PartialOverloading)
+  if (Args.size() < Function->getMinRequiredExplicitArguments() &&
+      !PartialOverloading)
     return TDK_TooFewArguments;
-  else if (TooManyArguments(NumParams, Args.size(), PartialOverloading)) {
+  else if (TooManyArguments(NumParams, Args.size() + ExplicitObjectOffset,
+                            PartialOverloading)) {
     const auto *Proto = Function->getType()->castAs<FunctionProtoType>();
     if (Proto->isTemplateVariadic())
       /* Do nothing */;
@@ -4157,7 +4179,8 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
   SmallVector<OriginalCallArg, 8> OriginalCallArgs;
 
   // Deduce an argument of type ParamType from an expression with index ArgIdx.
-  auto DeduceCallArgument = [&](QualType ParamType, unsigned ArgIdx) {
+  auto DeduceCallArgument = [&](QualType ParamType, unsigned ArgIdx,
+                                bool ExplicitObjetArgument) {
     // C++ [demp.deduct.call]p1: (DR1391)
     //   Template argument deduction is done by comparing each function template
     //   parameter that contains template-parameters that participate in
@@ -4165,10 +4188,21 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
     if (!hasDeducibleTemplateParameters(*this, FunctionTemplate, ParamType))
       return Sema::TDK_Success;
 
+    if (ExplicitObjetArgument) {
+      //   ... with the type of the corresponding argument
+      return DeduceTemplateArgumentsFromCallArgument(
+          *this, TemplateParams, FirstInnerIndex, ParamType, ObjectType,
+          ObjectClassification,
+          /*Arg=*/nullptr, Info, Deduced, OriginalCallArgs,
+          /*Decomposed*/ false, ArgIdx, /*TDF*/ 0);
+    }
+
     //   ... with the type of the corresponding argument
     return DeduceTemplateArgumentsFromCallArgument(
-        *this, TemplateParams, FirstInnerIndex, ParamType, Args[ArgIdx], Info, Deduced,
-        OriginalCallArgs, /*Decomposed*/false, ArgIdx, /*TDF*/ 0);
+        *this, TemplateParams, FirstInnerIndex, ParamType,
+        Args[ArgIdx]->getType(), Args[ArgIdx]->Classify(getASTContext()),
+        Args[ArgIdx], Info, Deduced, OriginalCallArgs, /*Decomposed*/ false,
+        ArgIdx, /*TDF*/ 0);
   };
 
   // Deduce template arguments from the function parameters.
@@ -4182,11 +4216,20 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
         dyn_cast<PackExpansionType>(ParamType);
     if (!ParamExpansion) {
       // Simple case: matching a function parameter to a function argument.
-      if (ArgIdx >= Args.size())
+      if (ArgIdx >= Args.size() && !(HasExplicitObject && ParamIdx == 0))
         break;
 
       ParamTypesForArgChecking.push_back(ParamType);
-      if (auto Result = DeduceCallArgument(ParamType, ArgIdx++))
+
+      if (ParamIdx == 0 && HasExplicitObject) {
+        if (auto Result = DeduceCallArgument(ParamType, 0,
+                                             /*ExplicitObjetArgument=*/true))
+          return Result;
+        continue;
+      }
+
+      if (auto Result = DeduceCallArgument(ParamType, ArgIdx++,
+                                           /*ExplicitObjetArgument=*/false))
         return Result;
 
       continue;
@@ -4219,7 +4262,8 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
       for (; ArgIdx < Args.size() && PackScope.hasNextElement();
            PackScope.nextPackElement(), ++ArgIdx) {
         ParamTypesForArgChecking.push_back(ParamPattern);
-        if (auto Result = DeduceCallArgument(ParamPattern, ArgIdx))
+        if (auto Result = DeduceCallArgument(ParamPattern, ArgIdx,
+                                             /*ExplicitObjetArgument=*/false))
           return Result;
       }
     } else {
@@ -4459,11 +4503,10 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
 /// Deduce template arguments for a templated conversion
 /// function (C++ [temp.deduct.conv]) and, if successful, produce a
 /// conversion function template specialization.
-Sema::TemplateDeductionResult
-Sema::DeduceTemplateArguments(FunctionTemplateDecl *ConversionTemplate,
-                              QualType ToType,
-                              CXXConversionDecl *&Specialization,
-                              TemplateDeductionInfo &Info) {
+Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
+    FunctionTemplateDecl *ConversionTemplate, QualType ObjectType,
+    Expr::Classification ObjectClassification, QualType ToType,
+    CXXConversionDecl *&Specialization, TemplateDeductionInfo &Info) {
   if (ConversionTemplate->isInvalidDecl())
     return TDK_Invalid;
 
@@ -4558,6 +4601,19 @@ Sema::DeduceTemplateArguments(FunctionTemplateDecl *ConversionTemplate,
   if ((P->isPointerType() && A->isPointerType()) ||
       (P->isMemberPointerType() && A->isMemberPointerType()))
     TDF |= TDF_IgnoreQualifiers;
+
+  SmallVector<Sema::OriginalCallArg, 1> OriginalCallArgs;
+  if (ConversionGeneric->isExplicitObjectMemberFunction()) {
+    QualType ParamType = ConversionGeneric->getParamDecl(0)->getType();
+    if (TemplateDeductionResult Result =
+            DeduceTemplateArgumentsFromCallArgument(
+                *this, TemplateParams, getFirstInnerIndex(ConversionTemplate),
+                ParamType, ObjectType, ObjectClassification,
+                /*Arg=*/nullptr, Info, Deduced, OriginalCallArgs,
+                /*Decomposed*/ false, 0, /*TDF*/ 0))
+      return Result;
+  }
+
   if (TemplateDeductionResult Result
         = DeduceTemplateArgumentsByTypeMatch(*this, TemplateParams,
                                              P, A, Info, Deduced, TDF))
@@ -4570,7 +4626,8 @@ Sema::DeduceTemplateArguments(FunctionTemplateDecl *ConversionTemplate,
   TemplateDeductionResult Result;
   runWithSufficientStackSpace(Info.getLocation(), [&] {
     Result = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0,
-                                             ConversionSpecialized, Info);
+                                             ConversionSpecialized, Info,
+                                             &OriginalCallArgs);
   });
   Specialization = cast_or_null<CXXConversionDecl>(ConversionSpecialized);
   return Result;
@@ -4785,9 +4842,25 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
     return TDK_Success;
   }
 
+  // Make sure that we treat 'char[]' equaly as 'char*' in C23 mode.
+  auto *String = dyn_cast<StringLiteral>(Init);
+  if (getLangOpts().C23 && String && Type.getType()->isArrayType()) {
+    Diag(Type.getBeginLoc(), diag::ext_c23_auto_non_plain_identifier);
+    TypeLoc TL = TypeLoc(Init->getType(), Type.getOpaqueData());
+    Result = SubstituteDeducedTypeTransform(*this, DependentResult).Apply(TL);
+    assert(!Result.isNull() && "substituting DependentTy can't fail");
+    return TDK_Success;
+  }
+
+  // Emit a warning if 'auto*' is used in pedantic and in C23 mode.
+  if (getLangOpts().C23 && Type.getType()->isPointerType()) {
+    Diag(Type.getBeginLoc(), diag::ext_c23_auto_non_plain_identifier);
+  }
+
   auto *InitList = dyn_cast<InitListExpr>(Init);
   if (!getLangOpts().CPlusPlus && InitList) {
-    Diag(Init->getBeginLoc(), diag::err_auto_init_list_from_c);
+    Diag(Init->getBeginLoc(), diag::err_auto_init_list_from_c)
+        << (int)AT->getKeyword() << getLangOpts().C23;
     return TDK_AlreadyDiagnosed;
   }
 
@@ -4846,7 +4919,8 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
         if (isa<DesignatedInitExpr>(Init))
           return TDK_Invalid;
         if (auto TDK = DeduceTemplateArgumentsFromCallArgument(
-                *this, TemplateParamsSt.get(), 0, TemplArg, Init, Info, Deduced,
+                *this, TemplateParamsSt.get(), 0, TemplArg, Init->getType(),
+                Init->Classify(getASTContext()), Init, Info, Deduced,
                 OriginalCallArgs, /*Decomposed=*/true,
                 /*ArgIdx=*/0, /*TDF=*/0)) {
           if (TDK == TDK_Inconsistent) {
@@ -4872,7 +4946,8 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
       assert(!FuncParam.isNull() &&
              "substituting template parameter for 'auto' failed");
       if (auto TDK = DeduceTemplateArgumentsFromCallArgument(
-              *this, TemplateParamsSt.get(), 0, FuncParam, Init, Info, Deduced,
+              *this, TemplateParamsSt.get(), 0, FuncParam, Init->getType(),
+              Init->Classify(getASTContext()), Init, Info, Deduced,
               OriginalCallArgs, /*Decomposed=*/false, /*ArgIdx=*/0, /*TDF=*/0,
               FailedTSC))
         return DeductionFailed(TDK);
@@ -5080,6 +5155,8 @@ AddImplicitObjectParameterType(ASTContext &Context,
   //
   // The standard doesn't say explicitly, but we pick the appropriate kind of
   // reference type based on [over.match.funcs]p4.
+  assert(Method && Method->isImplicitObjectMemberFunction() &&
+         "expected an implicit objet function");
   QualType ArgTy = Context.getTypeDeclType(Method->getParent());
   ArgTy = Context.getQualifiedType(ArgTy, Method->getMethodQualifiers());
   if (Method->getRefQualifier() == RQ_RValue)
@@ -5141,14 +5218,17 @@ static bool isAtLeastAsSpecializedAs(Sema &S,
 
     unsigned NumComparedArguments = NumCallArguments1;
 
-    if (!Method2 && Method1 && !Method1->isStatic()) {
+    if (!Method2 && Method1 && Method1->isImplicitObjectMemberFunction()) {
       // Compare 'this' from Method1 against first parameter from Method2.
       AddImplicitObjectParameterType(S.Context, Method1, Args1);
       ++NumComparedArguments;
-    } else if (!Method1 && Method2 && !Method2->isStatic()) {
+    } else if (!Method1 && Method2 &&
+               Method2->isImplicitObjectMemberFunction()) {
       // Compare 'this' from Method2 against first parameter from Method1.
       AddImplicitObjectParameterType(S.Context, Method2, Args2);
-    } else if (Method1 && Method2 && Reversed) {
+    } else if (Method1 && Method2 && Reversed &&
+               Method1->isImplicitObjectMemberFunction() &&
+               Method2->isImplicitObjectMemberFunction()) {
       // Compare 'this' from Method1 against second parameter from Method2
       // and 'this' from Method2 against second parameter from Method1.
       AddImplicitObjectParameterType(S.Context, Method1, Args1);
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 00a36696cf9045..23de64080a070c 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1075,7 +1075,9 @@ void Sema::PrintInstantiationStack() {
           << Active->InstantiationRange;
       break;
     case CodeSynthesisContext::BuildingDeductionGuides:
-      llvm_unreachable("unexpected deduction guide in instantiation stack");
+      Diags.Report(Active->PointOfInstantiation,
+                   diag::note_building_deduction_guide_here);
+      break;
     }
   }
 }
@@ -2908,6 +2910,8 @@ ParmVarDecl *Sema::SubstParmVarDecl(
     NewParm->setUninstantiatedDefaultArg(Arg);
   }
 
+  NewParm->setExplicitObjectParameterLoc(
+      OldParm->getExplicitObjectParamThisLoc());
   NewParm->setHasInheritedDefaultArg(OldParm->hasInheritedDefaultArg());
 
   if (OldParm->isParameterPack() && !NewParm->isParameterPack()) {
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 1aa4036756f369..071366ac0ee983 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2140,7 +2140,7 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl(
     Function = CXXDeductionGuideDecl::Create(
         SemaRef.Context, DC, D->getInnerLocStart(),
         InstantiatedExplicitSpecifier, NameInfo, T, TInfo,
-        D->getSourceRange().getEnd(), /*Ctor=*/nullptr,
+        D->getSourceRange().getEnd(), DGuide->getCorrespondingConstructor(),
         DGuide->getDeductionCandidateKind());
     Function->setAccess(D->getAccess());
   } else {
@@ -4434,7 +4434,7 @@ TemplateDeclInstantiator::SubstFunctionType(FunctionDecl *D,
   Qualifiers ThisTypeQuals;
   if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
     ThisContext = cast<CXXRecordDecl>(Owner);
-    ThisTypeQuals = Method->getMethodQualifiers();
+    ThisTypeQuals = Method->getFunctionObjectParameterType().getQualifiers();
   }
 
   TypeSourceInfo *NewTInfo = SemaRef.SubstFunctionDeclType(
@@ -4523,6 +4523,36 @@ TemplateDeclInstantiator::SubstFunctionType(FunctionDecl *D,
   return NewTInfo;
 }
 
+/// Introduce the instantiated local variables into the local
+/// instantiation scope.
+void Sema::addInstantiatedLocalVarsToScope(FunctionDecl *Function,
+                                           const FunctionDecl *PatternDecl,
+                                           LocalInstantiationScope &Scope) {
+  LambdaScopeInfo *LSI = cast<LambdaScopeInfo>(getFunctionScopes().back());
+
+  for (auto *decl : PatternDecl->decls()) {
+    if (!isa<VarDecl>(decl) || isa<ParmVarDecl>(decl))
+      continue;
+
+    VarDecl *VD = cast<VarDecl>(decl);
+    IdentifierInfo *II = VD->getIdentifier();
+
+    auto it = llvm::find_if(Function->decls(), [&](Decl *inst) {
+      VarDecl *InstVD = dyn_cast<VarDecl>(inst);
+      return InstVD && InstVD->isLocalVarDecl() &&
+             InstVD->getIdentifier() == II;
+    });
+
+    if (it == Function->decls().end())
+      continue;
+
+    Scope.InstantiatedLocal(VD, *it);
+    LSI->addCapture(cast<VarDecl>(*it), /*isBlock=*/false, /*isByref=*/false,
+                    /*isNested=*/false, VD->getLocation(), SourceLocation(),
+                    VD->getType(), /*Invalid=*/false);
+  }
+}
+
 /// Introduce the instantiated function parameters into the local
 /// instantiation scope, and set the parameter names to those used
 /// in the template.
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index d13a5564e9ad64..068971f8130a4a 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -16,6 +16,7 @@
 #include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/ASTStructuralEquivalence.h"
 #include "clang/AST/CXXInheritance.h"
+#include "clang/AST/Decl.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
@@ -40,6 +41,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <bitset>
 #include <optional>
@@ -3122,7 +3124,7 @@ QualType Sema::BuildMemberPointerType(QualType T, QualType Class,
       (Entity.getNameKind() == DeclarationName::CXXConstructorName) ||
       (Entity.getNameKind() == DeclarationName::CXXDestructorName);
   if (T->isFunctionType())
-    adjustMemberFunctionCC(T, /*IsStatic=*/false, IsCtorOrDtor, Loc);
+    adjustMemberFunctionCC(T, /*HasThisPointer=*/true, IsCtorOrDtor, Loc);
 
   return Context.getMemberPointerType(T, Class.getTypePtr());
 }
@@ -5801,7 +5803,12 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
     //
     // Core issue 547 also allows cv-qualifiers on function types that are
     // top-level template type arguments.
-    enum { NonMember, Member, DeductionGuide } Kind = NonMember;
+    enum {
+      NonMember,
+      Member,
+      ExplicitObjectMember,
+      DeductionGuide
+    } Kind = NonMember;
     if (D.getName().getKind() == UnqualifiedIdKind::IK_DeductionGuideName)
       Kind = DeductionGuide;
     else if (!D.getCXXScopeSpec().isSet()) {
@@ -5815,6 +5822,18 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
         Kind = Member;
     }
 
+    if (Kind == Member) {
+      unsigned I;
+      if (D.isFunctionDeclarator(I)) {
+        const DeclaratorChunk &Chunk = D.getTypeObject(I);
+        if (Chunk.Fun.NumParams) {
+          auto *P = dyn_cast_or_null<ParmVarDecl>(Chunk.Fun.Params->Param);
+          if (P && P->isExplicitObjectParameter())
+            Kind = ExplicitObjectMember;
+        }
+      }
+    }
+
     // C++11 [dcl.fct]p6 (w/DR1417):
     // An attempt to specify a function type with a cv-qualifier-seq or a
     // ref-qualifier (including by typedef-name) is ill-formed unless it is:
@@ -5832,7 +5851,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
     //
     // ... for instance.
     if (IsQualifiedFunction &&
-        !(Kind == Member &&
+        !(Kind == Member && !D.isExplicitObjectMemberFunction() &&
           D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_static) &&
         !IsTypedefName && D.getContext() != DeclaratorContext::TemplateArg &&
         D.getContext() != DeclaratorContext::TemplateTypeArg) {
@@ -8123,14 +8142,15 @@ bool Sema::hasExplicitCallingConv(QualType T) {
   return false;
 }
 
-void Sema::adjustMemberFunctionCC(QualType &T, bool IsStatic, bool IsCtorOrDtor,
-                                  SourceLocation Loc) {
+void Sema::adjustMemberFunctionCC(QualType &T, bool HasThisPointer,
+                                  bool IsCtorOrDtor, SourceLocation Loc) {
   FunctionTypeUnwrapper Unwrapped(*this, T);
   const FunctionType *FT = Unwrapped.get();
   bool IsVariadic = (isa<FunctionProtoType>(FT) &&
                      cast<FunctionProtoType>(FT)->isVariadic());
   CallingConv CurCC = FT->getCallConv();
-  CallingConv ToCC = Context.getDefaultCallingConvention(IsVariadic, !IsStatic);
+  CallingConv ToCC =
+      Context.getDefaultCallingConvention(IsVariadic, HasThisPointer);
 
   if (CurCC == ToCC)
     return;
@@ -8150,7 +8170,7 @@ void Sema::adjustMemberFunctionCC(QualType &T, bool IsStatic, bool IsCtorOrDtor,
     // we should adjust a __cdecl type to __thiscall for instance methods, and a
     // __thiscall type to __cdecl for static methods.
     CallingConv DefaultCC =
-        Context.getDefaultCallingConvention(IsVariadic, IsStatic);
+        Context.getDefaultCallingConvention(IsVariadic, !HasThisPointer);
 
     if (CurCC != DefaultCC)
       return;
@@ -9860,11 +9880,14 @@ QualType Sema::BuildAtomicType(QualType T, SourceLocation Loc) {
       DisallowedKind = 5;
     else if (T->isSizelessType())
       DisallowedKind = 6;
-    else if (!T.isTriviallyCopyableType(Context))
+    else if (!T.isTriviallyCopyableType(Context) && getLangOpts().CPlusPlus)
       // Some other non-trivially-copyable type (probably a C++ class)
       DisallowedKind = 7;
     else if (T->isBitIntType())
       DisallowedKind = 8;
+    else if (getLangOpts().C23 && T->isUndeducedAutoType())
+      // _Atomic auto is prohibited in C23
+      DisallowedKind = 9;
 
     if (DisallowedKind != -1) {
       Diag(Loc, diag::err_atomic_specifier_bad_type) << DisallowedKind << T;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 0fc5ad8e3bde6c..8fafdd4f5caa1e 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -2391,6 +2391,15 @@ class TreeTransform {
                                                  EndLoc);
   }
 
+  /// Build a new OpenMP 'ompx_bare' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPXBareClause(SourceLocation StartLoc,
+                                   SourceLocation EndLoc) {
+    return getSema().ActOnOpenMPXBareClause(StartLoc, EndLoc);
+  }
+
   /// Build a new OpenMP 'align' clause.
   ///
   /// By default, performs semantic analysis to build the new OpenMP clause.
@@ -5490,6 +5499,9 @@ TreeTransform<Derived>::TransformDependentSizedArrayType(TypeLocBuilder &TLB,
   EnterExpressionEvaluationContext Unevaluated(
       SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
 
+  // If we have a VLA then it won't be a constant.
+  SemaRef.ExprEvalContexts.back().InConditionallyConstantEvaluateContext = true;
+
   // Prefer the expression from the TypeLoc;  the other may have been uniqued.
   Expr *origSize = TL.getSizeExpr();
   if (!origSize) origSize = T->getSizeExpr();
@@ -10804,6 +10816,11 @@ TreeTransform<Derived>::TransformOMPXAttributeClause(OMPXAttributeClause *C) {
       NewAttrs, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
 }
 
+template <typename Derived>
+OMPClause *TreeTransform<Derived>::TransformOMPXBareClause(OMPXBareClause *C) {
+  return getDerived().RebuildOMPXBareClause(C->getBeginLoc(), C->getEndLoc());
+}
+
 //===----------------------------------------------------------------------===//
 // Expression transformation
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 0c9c5992c267b3..cdfd3b6e863a93 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -10447,6 +10447,9 @@ OMPClause *OMPClauseReader::readClause() {
   case llvm::omp::OMPC_ompx_attribute:
     C = new (Context) OMPXAttributeClause();
     break;
+  case llvm::omp::OMPC_ompx_bare:
+    C = new (Context) OMPXBareClause();
+    break;
 #define OMP_CLAUSE_NO_CLASS(Enum, Str)                                         \
   case llvm::omp::Enum:                                                        \
     break;
@@ -11548,6 +11551,8 @@ void OMPClauseReader::VisitOMPXAttributeClause(OMPXAttributeClause *C) {
   C->setLocEnd(Record.readSourceLocation());
 }
 
+void OMPClauseReader::VisitOMPXBareClause(OMPXBareClause *C) {}
+
 OMPTraitInfo *ASTRecordReader::readOMPTraitInfo() {
   OMPTraitInfo &TI = getContext().getNewOMPTraitInfo();
   TI.Sets.resize(readUInt32());
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 88cb54a0365e62..d553b3c6d78ded 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -1707,6 +1707,7 @@ void ASTDeclReader::VisitParmVarDecl(ParmVarDecl *PD) {
   PD->ParmVarDeclBits.HasInheritedDefaultArg = Record.readInt();
   if (Record.readInt()) // hasUninstantiatedDefaultArg.
     PD->setUninstantiatedDefaultArg(Record.readExpr());
+  PD->ExplicitObjectParameterIntroducerLoc = Record.readSourceLocation();
 
   // FIXME: If this is a redeclaration of a function from another module, handle
   // inheritance of default arguments.
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 8edb04f4f81907..1bdc3fa3bea455 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -596,6 +596,7 @@ void ASTStmtReader::VisitDeclRefExpr(DeclRefExpr *E) {
   E->DeclRefExprBits.RefersToEnclosingVariableOrCapture = Record.readInt();
   E->DeclRefExprBits.NonOdrUseReason = Record.readInt();
   E->DeclRefExprBits.IsImmediateEscalating = Record.readInt();
+  E->DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = false;
   unsigned NumTemplateArgs = 0;
   if (E->hasTemplateKWAndArgsInfo())
     NumTemplateArgs = Record.readInt();
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 201e2fcaaec91a..8d8a10336576d9 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7258,6 +7258,8 @@ void OMPClauseWriter::VisitOMPXAttributeClause(OMPXAttributeClause *C) {
   Record.AddSourceLocation(C->getEndLoc());
 }
 
+void OMPClauseWriter::VisitOMPXBareClause(OMPXBareClause *C) {}
+
 void ASTRecordWriter::writeOMPTraitInfo(const OMPTraitInfo *TI) {
   writeUInt32(TI->Sets.size());
   for (const auto &Set : TI->Sets) {
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 4426f5c22277ed..91c8ed9f75db1f 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1162,28 +1162,21 @@ void ASTDeclWriter::VisitParmVarDecl(ParmVarDecl *D) {
   Record.push_back(D->hasUninstantiatedDefaultArg());
   if (D->hasUninstantiatedDefaultArg())
     Record.AddStmt(D->getUninstantiatedDefaultArg());
+  Record.AddSourceLocation(D->getExplicitObjectParamThisLoc());
   Code = serialization::DECL_PARM_VAR;
 
   // If the assumptions about the DECL_PARM_VAR abbrev are true, use it.  Here
   // we dynamically check for the properties that we optimize for, but don't
   // know are true of all PARM_VAR_DECLs.
-  if (D->getDeclContext() == D->getLexicalDeclContext() &&
-      !D->hasAttrs() &&
-      !D->hasExtInfo() &&
-      !D->isImplicit() &&
-      !D->isUsed(false) &&
-      !D->isInvalidDecl() &&
-      !D->isReferenced() &&
-      D->getAccess() == AS_none &&
-      !D->isModulePrivate() &&
-      D->getStorageClass() == 0 &&
+  if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() &&
+      !D->hasExtInfo() && !D->isImplicit() && !D->isUsed(false) &&
+      !D->isInvalidDecl() && !D->isReferenced() && D->getAccess() == AS_none &&
+      !D->isModulePrivate() && D->getStorageClass() == 0 &&
       D->getInitStyle() == VarDecl::CInit && // Can params have anything else?
-      D->getFunctionScopeDepth() == 0 &&
-      D->getObjCDeclQualifier() == 0 &&
-      !D->isKNRPromoted() &&
-      !D->hasInheritedDefaultArg() &&
-      D->getInit() == nullptr &&
-      !D->hasUninstantiatedDefaultArg())  // No default expr.
+      D->getFunctionScopeDepth() == 0 && D->getObjCDeclQualifier() == 0 &&
+      !D->isKNRPromoted() && !D->isExplicitObjectParameter() &&
+      !D->hasInheritedDefaultArg() && D->getInit() == nullptr &&
+      !D->hasUninstantiatedDefaultArg()) // No default expr.
     AbbrevToUse = Writer.getDeclParmVarAbbrev();
 
   // Check things we know are true of *every* PARM_VAR_DECL, which is more than
diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
index 906f4e85a8e5b5..627b51af6bd44a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
@@ -899,6 +899,14 @@ void NullabilityChecker::checkPostCall(const CallEvent &Call,
   const NullabilityState *TrackedNullability =
       State->get<NullabilityMap>(Region);
 
+  // ObjCMessageExpr gets the actual type through
+  // Sema::getMessageSendResultType, instead of using the return type of
+  // MethodDecl directly. The final type is generated by considering the
+  // nullability of receiver and MethodDecl together. Thus, The type of
+  // ObjCMessageExpr is prefer.
+  if (const Expr *E = Call.getOriginExpr())
+    ReturnType = E->getType();
+
   if (!TrackedNullability &&
       getNullabilityAnnotation(ReturnType) == Nullability::Nullable) {
     State = State->set<NullabilityMap>(Region, Nullability::Nullable);
@@ -1053,7 +1061,7 @@ void NullabilityChecker::checkPostObjCMessage(const ObjCMethodCall &M,
   }
 
   // No tracked information. Use static type information for return value.
-  Nullability RetNullability = getNullabilityAnnotation(RetType);
+  Nullability RetNullability = getNullabilityAnnotation(Message->getType());
 
   // Properties might be computed, which means the property value could
   // theoretically change between calls even in commonly-observed cases like
diff --git a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp
index e8d74b40c6fd84..5c10e757244d7f 100644
--- a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp
@@ -272,7 +272,7 @@ BasicValueFactory::evalAPSInt(BinaryOperator::Opcode Op,
       // FIXME: This logic should probably go higher up, where we can
       // test these conditions symbolically.
 
-      if (V2.isSigned() && V2.isNegative())
+      if (V2.isNegative() || V2.getBitWidth() > 64)
         return nullptr;
 
       uint64_t Amt = V2.getZExtValue();
@@ -287,7 +287,7 @@ BasicValueFactory::evalAPSInt(BinaryOperator::Opcode Op,
       // FIXME: This logic should probably go higher up, where we can
       // test these conditions symbolically.
 
-      if (V2.isSigned() && V2.isNegative())
+      if (V2.isNegative() || V2.getBitWidth() > 64)
         return nullptr;
 
       uint64_t Amt = V2.getZExtValue();
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index d7c5bd1d404235..451ee91b94533d 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -299,7 +299,7 @@ ProgramStateRef ExprEngine::getInitialState(const LocationContext *InitLoc) {
   }
 
   if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) {
-    if (!MD->isStatic()) {
+    if (MD->isImplicitObjectMemberFunction()) {
       // Precondition: 'this' is always non-null upon entry to the
       // top-level function.  This is our starting assumption for
       // analyzing an "open" program.
@@ -2114,7 +2114,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
       // valid region.
       const Decl *Callee = OCE->getCalleeDecl();
       if (const auto *MD = dyn_cast_or_null<CXXMethodDecl>(Callee)) {
-        if (MD->isInstance()) {
+        if (MD->isImplicitObjectMemberFunction()) {
           ProgramStateRef State = Pred->getState();
           const LocationContext *LCtx = Pred->getLocationContext();
           ProgramStateRef NewState =
@@ -3357,7 +3357,7 @@ void ExprEngine::VisitMemberExpr(const MemberExpr *M, ExplodedNode *Pred,
 
       // Handle C++ method calls.
       if (const auto *MD = dyn_cast<CXXMethodDecl>(Member)) {
-        if (MD->isInstance())
+        if (MD->isImplicitObjectMemberFunction())
           state = createTemporaryRegionIfNeeded(state, LCtx, BaseExpr);
 
         SVal MDVal = svalBuilder.getFunctionPointer(MD);
diff --git a/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp b/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp
index 748c65f578a81a..a3b29ff487e4ed 100644
--- a/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp
+++ b/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp
@@ -84,7 +84,7 @@ ProgramStateRef getWidenedLoopState(ProgramStateRef PrevState,
   // pointer should remain unchanged.  Ignore static methods, since they do not
   // have 'this' pointers.
   const CXXMethodDecl *CXXMD = dyn_cast<CXXMethodDecl>(STC->getDecl());
-  if (CXXMD && !CXXMD->isStatic()) {
+  if (CXXMD && CXXMD->isImplicitObjectMemberFunction()) {
     const CXXThisRegion *ThisR =
         MRMgr.getCXXThisRegion(CXXMD->getThisType(), STC);
     ITraits.setTrait(ThisR,
diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
index f827f43eaa7da6..d89d82626f7269 100644
--- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
@@ -275,7 +275,7 @@ DefinedSVal SValBuilder::getMemberPointer(const NamedDecl *ND) {
     // We don't need to play a similar trick for static member fields
     // because these are represented as plain VarDecls and not FieldDecls
     // in the AST.
-    if (MD->isStatic())
+    if (!MD->isImplicitObjectMemberFunction())
       return getFunctionPointer(MD);
   }
 
diff --git a/clang/lib/Tooling/Core/Replacement.cpp b/clang/lib/Tooling/Core/Replacement.cpp
index 2c472df086d5ec..269f17a6db4cfc 100644
--- a/clang/lib/Tooling/Core/Replacement.cpp
+++ b/clang/lib/Tooling/Core/Replacement.cpp
@@ -67,7 +67,7 @@ bool Replacement::isApplicable() const {
 
 bool Replacement::apply(Rewriter &Rewrite) const {
   SourceManager &SM = Rewrite.getSourceMgr();
-  auto Entry = SM.getFileManager().getFile(FilePath);
+  auto Entry = SM.getFileManager().getOptionalFileRef(FilePath);
   if (!Entry)
     return false;
 
diff --git a/clang/lib/Tooling/Refactoring.cpp b/clang/lib/Tooling/Refactoring.cpp
index d45cd8c57f10e9..961fc1c1801547 100644
--- a/clang/lib/Tooling/Refactoring.cpp
+++ b/clang/lib/Tooling/Refactoring.cpp
@@ -78,10 +78,7 @@ bool formatAndApplyAllReplacements(
     const std::string &FilePath = FileAndReplaces.first;
     auto &CurReplaces = FileAndReplaces.second;
 
-    const FileEntry *Entry = nullptr;
-    if (auto File = Files.getFile(FilePath))
-      Entry = *File;
-
+    FileEntryRef Entry = llvm::cantFail(Files.getFileRef(FilePath));
     FileID ID = SM.getOrCreateFileID(Entry, SrcMgr::C_User);
     StringRef Code = SM.getBufferData(ID);
 
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp
index cd4ad010af1222..65361d67d68d57 100644
--- a/clang/test/AST/Interp/builtin-functions.cpp
+++ b/clang/test/AST/Interp/builtin-functions.cpp
@@ -260,3 +260,25 @@ namespace SourceLocation {
     static_assert(c.a.n == __LINE__ - 1, "");
   }
 }
+
+namespace popcount {
+  static_assert(__builtin_popcount(~0u) == __CHAR_BIT__ * sizeof(unsigned int), "");
+  static_assert(__builtin_popcount(0) == 0, "");
+  static_assert(__builtin_popcountl(~0ul) == __CHAR_BIT__ * sizeof(unsigned long), "");
+  static_assert(__builtin_popcountl(0) == 0, "");
+  static_assert(__builtin_popcountll(~0ull) == __CHAR_BIT__ * sizeof(unsigned long long), "");
+  static_assert(__builtin_popcountll(0) == 0, "");
+
+  /// From test/Sema/constant-builtins-2.c
+#define BITSIZE(x) (sizeof(x) * 8)
+  char popcount1[__builtin_popcount(0) == 0 ? 1 : -1];
+  char popcount2[__builtin_popcount(0xF0F0) == 8 ? 1 : -1];
+  char popcount3[__builtin_popcount(~0) == BITSIZE(int) ? 1 : -1];
+  char popcount4[__builtin_popcount(~0L) == BITSIZE(int) ? 1 : -1];
+  char popcount5[__builtin_popcountl(0L) == 0 ? 1 : -1];
+  char popcount6[__builtin_popcountl(0xF0F0L) == 8 ? 1 : -1];
+  char popcount7[__builtin_popcountl(~0L) == BITSIZE(long) ? 1 : -1];
+  char popcount8[__builtin_popcountll(0LL) == 0 ? 1 : -1];
+  char popcount9[__builtin_popcountll(0xF0F0LL) == 8 ? 1 : -1];
+  char popcount10[__builtin_popcountll(~0LL) == BITSIZE(long long) ? 1 : -1];
+}
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index c0ec5f8339dd1d..47058b8a93203b 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -14,11 +14,15 @@ _Static_assert(!!1.0, ""); // pedantic-ref-warning {{not an integer constant exp
                            // pedantic-expected-warning {{not an integer constant expression}}
 _Static_assert(!!1, "");
 
-/// FIXME: Should also be rejected in the new interpreter
-int a = (1 == 1 ? 5 : 3);
+int a = (1 == 1 ? 5 : 3); // expected-note {{declared here}} \
+                          // pedantic-expected-note {{declared here}}
 _Static_assert(a == 5, ""); // ref-error {{not an integral constant expression}} \
                             // pedantic-ref-error {{not an integral constant expression}} \
-                            // pedantic-expected-warning {{not an integer constant expression}}
+                            // expected-error {{not an integral constant expression}} \
+                            // expected-note {{read of non-const variable}} \
+                            // pedantic-expected-error {{not an integral constant expression}} \
+                            // pedantic-expected-note {{read of non-const variable}}
+
 
 const int b = 3;
 _Static_assert(b == 3, ""); // pedantic-ref-warning {{not an integer constant expression}} \
diff --git a/clang/test/AST/Interp/constexpr-subobj-initialization.cpp b/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
new file mode 100644
index 00000000000000..4976b165468bd6
--- /dev/null
+++ b/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
+
+/// This is like the version in test/SemaCXX/, but some of the
+/// output types and their location has been adapted.
+/// Differences:
+///   1) The type of the uninitialized base class is printed WITH the namespace,
+///      i.e. 'baseclass_uninit::DelBase' instead of just 'DelBase'.
+///   2) The location is not the base specifier declaration, but the call site
+///      of the constructor.
+
+
+namespace baseclass_uninit {
+struct DelBase {
+  constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}}
+};
+
+struct Foo : DelBase {
+  constexpr Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}}
+};
+constexpr Foo f; // expected-error {{must be initialized by a constant expression}} \
+                 // expected-note {{constructor of base class 'baseclass_uninit::DelBase' is not called}}
+
+struct Bar : Foo {
+  constexpr Bar() {};
+};
+constexpr Bar bar; // expected-error {{must be initialized by a constant expression}} \
+                   // expected-note {{constructor of base class 'baseclass_uninit::DelBase' is not called}}
+
+struct Base {};
+struct A : Base {
+  constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}}
+};
+
+constexpr A a; // expected-error {{must be initialized by a constant expression}} \
+               // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
+
+
+struct B : Base {
+  constexpr B() : {} // expected-error {{expected class member or base class name}}
+};
+
+constexpr B b; // expected-error {{must be initialized by a constant expression}} \
+               // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
+} // namespace baseclass_uninit
+
+
+struct Foo {
+  constexpr Foo(); // expected-note 2{{declared here}}
+};
+
+constexpr Foo ff; // expected-error {{must be initialized by a constant expression}} \
+                  // expected-note {{undefined constructor 'Foo' cannot be used in a constant expression}}
+
+struct Bar : protected Foo {
+  int i;
+  constexpr Bar() : i(12) {} // expected-note {{undefined constructor 'Foo' cannot be used in a constant expression}}
+};
+
+constexpr Bar bb; // expected-error {{must be initialized by a constant expression}} \
+                  // expected-note {{in call to 'Bar()'}}
+
+template <typename Ty>
+struct Baz {
+  constexpr Baz(); // expected-note {{declared here}}
+};
+
+struct Quux : Baz<Foo>, private Bar {
+  int i;
+  constexpr Quux() : i(12) {} // expected-note {{undefined constructor 'Baz' cannot be used in a constant expression}}
+};
+
+constexpr Quux qx; // expected-error {{must be initialized by a constant expression}} \
+                   // expected-note {{in call to 'Quux()'}}
diff --git a/clang/test/AST/Interp/lambda.cpp b/clang/test/AST/Interp/lambda.cpp
index da1d706af1d050..f8400898acc0c0 100644
--- a/clang/test/AST/Interp/lambda.cpp
+++ b/clang/test/AST/Interp/lambda.cpp
@@ -179,3 +179,24 @@ namespace LambdasAsParams {
   }
   static_assert(heh() == 1.0);
 }
+
+namespace ThisCapture {
+  class Foo {
+  public:
+    int b = 32;
+    int a;
+
+    constexpr Foo() : a([this](){ return b + 1;}()) {}
+
+    constexpr int Aplus2() const {
+      auto F = [this]() {
+        return a + 2;
+      };
+
+      return F();
+    }
+  };
+  constexpr Foo F;
+  static_assert(F.a == 33, "");
+  static_assert(F.Aplus2() == (33 + 2), "");
+}
diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp
index 0dd036353fa7ef..00875bcf44dc8e 100644
--- a/clang/test/AST/Interp/literals.cpp
+++ b/clang/test/AST/Interp/literals.cpp
@@ -7,8 +7,6 @@
 #define INT_MAX __INT_MAX__
 
 typedef __INTPTR_TYPE__ intptr_t;
-typedef __int128 int128_t;
-typedef unsigned __int128 uint128_t;
 
 
 static_assert(true, "");
@@ -29,7 +27,10 @@ static_assert(number != 10, ""); // expected-error{{failed}} \
                                  // ref-note{{evaluates to}}
 
 
+#ifdef __SIZEOF_INT128__
 namespace i128 {
+  typedef __int128 int128_t;
+  typedef unsigned __int128 uint128_t;
   constexpr int128_t I128_1 = 12;
   static_assert(I128_1 == 12, "");
   static_assert(I128_1 != 10, "");
@@ -51,6 +52,9 @@ namespace i128 {
   constexpr int128_t Two = (int128_t)1 << 1ul;
   static_assert(Two == 2, "");
 
+  constexpr uint128_t AllOnes = ~static_cast<uint128_t>(0);
+  static_assert(AllOnes == UINT128_MAX, "");
+
 #if __cplusplus >= 201402L
   template <typename T>
   constexpr T CastFrom(__int128_t A) {
@@ -66,6 +70,14 @@ namespace i128 {
   static_assert(CastFrom<double>(12) == 12, "");
   static_assert(CastFrom<long double>(12) == 12, "");
 
+  static_assert(CastFrom<char>(AllOnes) == -1, "");
+  static_assert(CastFrom<unsigned char>(AllOnes) == 0xFF, "");
+  static_assert(CastFrom<long>(AllOnes) == -1, "");
+  static_assert(CastFrom<unsigned short>(AllOnes) == 0xFFFF, "");
+  static_assert(CastFrom<int>(AllOnes) == -1, "");
+  static_assert(CastFrom<int128_t>(AllOnes) == -1, "");
+  static_assert(CastFrom<uint128_t>(AllOnes) == AllOnes, "");
+
   template <typename T>
   constexpr __int128 CastTo(T A) {
     int128_t B = (int128_t)A;
@@ -87,6 +99,7 @@ constexpr int128_t Error = __LDBL_MAX__; // ref-warning {{implicit conversion of
                                          // expected-error {{must be initialized by a constant expression}} \
                                          // expected-note {{is outside the range of representable values of type}}
 }
+#endif
 
 constexpr bool b = number;
 static_assert(b, "");
diff --git a/clang/test/AST/ast-dump-udl-consteval.cpp b/clang/test/AST/ast-dump-udl-consteval.cpp
new file mode 100644
index 00000000000000..9da53f725172ab
--- /dev/null
+++ b/clang/test/AST/ast-dump-udl-consteval.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -xc++ -std=c++23 -ast-dump %s | FileCheck %s
+
+int inline consteval operator""_u32(unsigned long long val) {
+  return val;
+}
+
+void udl() {
+  (void)(0_u32 + 1_u32);
+}
+
+// CHECK: `-BinaryOperator {{.+}} <col:10, col:18> 'int' '+'
+// CHECK-NEXT: |-ConstantExpr {{.+}} <col:10> 'int'
+// CHECK-NEXT: | |-value: Int 0
+// CHECK-NEXT: | `-UserDefinedLiteral {{.+}} <col:10> 'int'
+// CHECK: `-ConstantExpr {{.+}} <col:18> 'int'
+// CHECK-NEXT:   |-value: Int 1
+// CHECK-NEXT:   `-UserDefinedLiteral {{.+}} <col:18> 'int'
diff --git a/clang/test/Analysis/cxx2b-deducing-this.cpp b/clang/test/Analysis/cxx2b-deducing-this.cpp
new file mode 100644
index 00000000000000..d22a897097bec0
--- /dev/null
+++ b/clang/test/Analysis/cxx2b-deducing-this.cpp
@@ -0,0 +1,62 @@
+// RUN: %clang_analyze_cc1 -std=c++2b -verify %s \
+// RUN:   -analyzer-checker=core,debug.ExprInspection
+
+template <typename T> void clang_analyzer_dump(T);
+
+struct S {
+  int num;
+  S *orig;
+
+  void a(this auto Self) {
+    clang_analyzer_dump(&Self);     // expected-warning {{&Self}}
+    clang_analyzer_dump(Self.orig); // expected-warning {{&s}}
+    clang_analyzer_dump(Self.num);       // expected-warning {{5 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{5 S32b}}
+
+    Self.num = 1;
+    clang_analyzer_dump(Self.num);       // expected-warning {{1 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{5 S32b}}
+  }
+
+  void b(this auto& Self) {
+    clang_analyzer_dump(&Self);     // expected-warning {{&s}}
+    clang_analyzer_dump(Self.orig); // expected-warning {{&s}}
+    clang_analyzer_dump(Self.num);       // expected-warning {{5 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{5 S32b}}
+
+    Self.num = 2;
+    clang_analyzer_dump(Self.num);       // expected-warning {{2 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{2 S32b}}
+  }
+
+  void c(this S Self) {
+    clang_analyzer_dump(&Self);     // expected-warning {{&Self}}
+    clang_analyzer_dump(Self.orig); // expected-warning {{&s}}
+    clang_analyzer_dump(Self.num);       // expected-warning {{2 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{2 S32b}}
+
+    Self.num = 3;
+    clang_analyzer_dump(Self.num);       // expected-warning {{3 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{2 S32b}}
+  }
+
+  void c(this S Self, int I) {
+    clang_analyzer_dump(I); // expected-warning {{11 S32b}}
+    clang_analyzer_dump(&Self);     // expected-warning {{&Self}}
+    clang_analyzer_dump(Self.orig); // expected-warning {{&s}}
+    clang_analyzer_dump(Self.num);       // expected-warning {{2 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{2 S32b}}
+
+    Self.num = 4;
+    clang_analyzer_dump(Self.num);       // expected-warning {{4 S32b}}
+    clang_analyzer_dump(Self.orig->num); // expected-warning {{2 S32b}}
+  }
+};
+
+void top() {
+  S s = {/*num=*/5, /*orig=*/&s};
+  s.a();
+  s.b(); // This call changes 's.num' to 2.
+  s.c();
+  s.c(11);
+}
diff --git a/clang/test/Analysis/int128-nocrash.c b/clang/test/Analysis/int128-nocrash.c
new file mode 100644
index 00000000000000..457254ce50caf0
--- /dev/null
+++ b/clang/test/Analysis/int128-nocrash.c
@@ -0,0 +1,15 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=optin.portability.UnixAPI \
+// RUN:    -triple x86_64-pc-linux-gnu -x c %s
+
+// Don't crash!
+// expected-no-diagnostics
+const __int128_t a = ( (__int128_t)1 << 64 );
+const _BitInt(72) b = ( 1 << 72 );
+
+void int128() {
+  2 >> a;
+}
+
+void withbitint() {
+  2 >> b;
+}
diff --git a/clang/test/Analysis/nullability.mm b/clang/test/Analysis/nullability.mm
index 06bb9912296e32..d69116d03df746 100644
--- a/clang/test/Analysis/nullability.mm
+++ b/clang/test/Analysis/nullability.mm
@@ -55,6 +55,7 @@ - (void)takesUnspecified:(int *)p;
 @property(readonly, nullable) void (^propReturnsNullableBlock)(void);
 @property(readonly, nullable) int *propReturnsNullable;
 @property(readonly) int *propReturnsUnspecified;
++ (nullable TestObject *)getNullableObject;
 @end
 
 TestObject * getUnspecifiedTestObject();
@@ -256,6 +257,12 @@ void testObjCPropertyReadNullability() {
   case 8:
     [o takesNonnullBlock:o.propReturnsNullableBlock]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
     break;
+  case 9:
+    [o takesNonnull:getNullableTestObject().propReturnsNonnull]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
+    break;
+  case 10:
+    [o takesNonnull:[TestObject getNullableObject].propReturnsNonnull]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
+    break;
   }
 }
 
diff --git a/clang/test/C/C2x/n3007.c b/clang/test/C/C2x/n3007.c
new file mode 100644
index 00000000000000..1fd20332ceb471
--- /dev/null
+++ b/clang/test/C/C2x/n3007.c
@@ -0,0 +1,186 @@
+// RUN: %clang_cc1 -std=c2x -verify -pedantic -Wno-comments %s
+
+/* WG14 N3007: Yes
+ * Type Inference for object definitions
+ */
+void test_qualifiers(int x, const int y, int * restrict z) {
+  const auto a = x;
+  auto b = y;
+  static auto c = 1UL;
+  int* pa = &a; // expected-warning {{initializing 'int *' with an expression of type 'const int *' discards qualifiers}}
+  const int* pb = &b;
+  int* pc = &c; // expected-warning {{incompatible pointer types initializing 'int *' with an expression of type 'unsigned long *'}}
+
+  const int ci = 12;
+  auto yup = ci;
+  yup = 12;
+
+  auto r_test = z;
+
+  _Static_assert(_Generic(a, int : 1));
+  _Static_assert(_Generic(c, unsigned long : 1));
+  _Static_assert(_Generic(pa, int * : 1));
+  _Static_assert(_Generic(pb, const int * : 1));
+  _Static_assert(_Generic(r_test, int * : 1));
+}
+
+void test_atomic(void) {
+  _Atomic auto i = 12;  // expected-error {{_Atomic cannot be applied to type 'auto' in C23}}
+  _Atomic(auto) j = 12; // expected-error {{'auto' not allowed here}} \
+                           expected-error {{a type specifier is required for all declarations}}
+
+  _Atomic(int) foo(void);
+  auto k = foo();
+
+  _Static_assert(_Generic(&i, _Atomic auto *: 1)); // expected-error {{_Atomic cannot be applied to type 'auto' in C23}} \
+                                                      expected-error {{'auto' not allowed here}}
+  _Static_assert(_Generic(k, int: 1));
+}
+
+void test_double(void) {
+  double A[3] = { 0 };
+  auto pA = A;
+  auto qA = &A;
+  auto pi = 3.14;
+
+  _Static_assert(_Generic(A, double * : 1));
+  _Static_assert(_Generic(pA, double * : 1));
+  _Static_assert(_Generic(qA, double (*)[3] : 1));
+  _Static_assert(_Generic(pi, double : 1));
+}
+
+int test_auto_param(auto a) { // expected-error {{'auto' not allowed in function prototype}}
+  return (int)(a * 2);
+}
+
+auto test_auto_return(float a, int b) { // expected-error {{'auto' not allowed in function return type}}
+  return ((a * b) * (a / b));
+}
+
+[[clang::overloadable]] auto test(auto x) { // expected-error {{'auto' not allowed in function prototype}} \
+                                               expected-error {{'auto' not allowed in function return type}}
+  return x;
+}
+
+void test_sizeof_alignas(void) {
+  (void)sizeof(auto);       // expected-error {{expected expression}}
+  _Alignas(auto) int a[4];  // expected-error {{expected expression}}
+}
+
+void test_arrary(void) {
+  auto a[4];          // expected-error {{'auto' not allowed in array declaration}}
+  auto b[] = {1, 2};  // expected-error {{cannot use 'auto' with array in C}}
+}
+
+void test_initializer_list(void) {
+  auto a = {};        // expected-error {{cannot use 'auto' with array in C}}
+  auto b = { 0 };     // expected-error {{cannot use 'auto' with array in C}}
+  auto c = { 1, };    // expected-error {{cannot use 'auto' with array in C}}
+  auto d = { 1 , 2 }; // expected-error {{cannot use 'auto' with array in C}}
+  auto e = (int [3]){ 1, 2, 3 };
+}
+
+void test_structs(void) {
+  // FIXME: Both of these should be diagnosed as invalid underspecified
+  // declarations as described in N3006.
+  auto p1 = (struct { int a; } *)0;
+  struct s;
+  auto p2 = (struct s { int a; } *)0;
+
+  struct B { auto b; };   // expected-error {{'auto' not allowed in struct member}}
+}
+
+void test_typedefs(void) {
+  typedef auto auto_type;   // expected-error {{'auto' not allowed in typedef}}
+
+  typedef auto (*fp)(void); // expected-error {{'auto' not allowed in typedef}}
+  typedef void (*fp)(auto); // expected-error {{'auto' not allowed in function prototype}}
+
+  _Generic(0, auto : 1);    // expected-error {{'auto' not allowed here}}
+}
+
+void test_misc(void) {
+  auto something;                           // expected-error {{declaration of variable 'something' with deduced type 'auto' requires an initializer}}
+  auto test_char = 'A';
+  auto test_char_ptr = "test";
+  auto test_char_ptr2[] = "another test";   // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  auto auto_size = sizeof(auto);            // expected-error {{expected expression}}
+
+  _Static_assert(_Generic(test_char, int : 1));
+  _Static_assert(_Generic(test_char_ptr, char * : 1));
+  _Static_assert(_Generic(test_char_ptr2, char * : 1));
+}
+
+void test_no_integer_promotions(void) {
+  short s;
+  auto a = s;
+  _Generic(a, int : 1); // expected-error {{controlling expression type 'short' not compatible with any generic association type}}
+}
+
+void test_compound_literals(void) {
+  auto a = (int){};
+  auto b = (int){ 0 };
+  auto c = (int){ 0, };
+  auto d = (int){ 0, 1 };       // expected-warning {{excess elements in scalar initializer}}
+
+  auto auto_cl = (auto){13};    // expected-error {{expected expression}}
+
+  _Static_assert(_Generic(a, int : 1));
+  _Static_assert(_Generic(b, int : 1));
+  _Static_assert(_Generic(c, int : 1));
+}
+
+void test_pointers(void) {
+  int a;
+  auto *ptr = &a; // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  auto *ptr2 = a; // expected-error {{variable 'ptr2' with type 'auto *' has incompatible initializer of type 'int'}} \
+                     expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  auto nptr = nullptr;
+
+  _Static_assert(_Generic(ptr, int * : 1));
+  _Static_assert(_Generic(ptr2, int * : 1));
+}
+
+void test_scopes(void) {
+  double a = 7;
+  double b = 9;
+  {
+    auto a = a * a; // expected-error {{variable 'a' declared with deduced type 'auto' cannot appear in its own initializer}} \
+                       expected-error {{variable 'a' declared with deduced type 'auto' cannot appear in its own initializer}}
+  }
+  {
+    auto b = a * a;
+    auto a = b;
+
+    _Static_assert(_Generic(a, double : 1));
+    _Static_assert(_Generic(b, double : 1));
+  }
+}
+
+void test_loop(void) {
+  auto j = 4;
+  for (auto i = j; i < 2 * j; i++);
+
+  _Static_assert(_Generic(j, int : 1));
+}
+
+#define AUTO_MACRO(_NAME, ARG, ARG2, ARG3) \
+auto _NAME = ARG + (ARG2 / ARG3);
+
+// This macro should only work with integers due to the usage of binary operators
+#define AUTO_INT_MACRO(_NAME, ARG, ARG2, ARG3) \
+auto _NAME = (ARG ^ ARG2) & ARG3;
+
+void test_macros(int in_int) {
+  auto a = in_int + 1;
+  AUTO_MACRO(b, 1.3, 2.5f, 3);
+  AUTO_INT_MACRO(c, 64, 23, 0xff);
+  AUTO_INT_MACRO(not_valid, 51.5, 25, 0xff); // expected-error {{invalid operands to binary expression ('double' and 'int')}}
+
+  auto result = (a + (int)b) - c;
+
+  _Static_assert(_Generic(a, int : 1));
+  _Static_assert(_Generic(b, double : 1));
+  _Static_assert(_Generic(c, int : 1));
+  _Static_assert(_Generic(result, int : 1));
+}
diff --git a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp
index d545f49fa5db3d..6214ff8006d67f 100644
--- a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp
@@ -283,3 +283,12 @@ namespace std_example {
     return r;
   }
 }
+
+struct Base {
+ constexpr Base() = default;
+};
+struct Derived : virtual Base { // expected-note 3{{virtual base class declared here}}
+  constexpr Derived() = default; // expected-error {{default constructor cannot be 'constexpr' in a class with virtual base class}}
+  constexpr Derived(const Derived&) = default; // expected-error {{copy constructor cannot be 'constexpr' in a class with virtual base class}}
+  constexpr Derived(Derived&&) = default; // expected-error {{move constructor cannot be 'constexpr' in a class with virtual base class}}
+};
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/dcl.fct.def.default/p1.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/dcl.fct.def.default/p1.cpp
index 21f71f05419cd0..b1b5b77c4622fc 100644
--- a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/dcl.fct.def.default/p1.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/dcl.fct.def.default/p1.cpp
@@ -5,10 +5,10 @@
 //   -- not have default arguments
 struct DefArg {
   static DefArg &&make();
-  DefArg(int n = 5) = default; // expected-error {{an explicitly-defaulted constructor cannot have default arguments}}
-  DefArg(const DefArg &DA = make()) = default; // expected-error {{an explicitly-defaulted constructor cannot have default arguments}}
+  DefArg(int n = 5) = default; // expected-error {{an explicitly-defaulted default constructor cannot have default arguments}}
+  DefArg(const DefArg &DA = make()) = default; // expected-error {{an explicitly-defaulted default constructor cannot have default arguments}}
   DefArg(const DefArg &DA, int k = 3) = default; // expected-error {{an explicitly-defaulted copy constructor cannot have default arguments}}
-  DefArg(DefArg &&DA = make()) = default; // expected-error {{an explicitly-defaulted constructor cannot have default arguments}}
+  DefArg(DefArg &&DA = make()) = default; // expected-error {{an explicitly-defaulted default constructor cannot have default arguments}}
   DefArg(DefArg &&DA, int k = 3) = default; // expected-error {{an explicitly-defaulted move constructor cannot have default arguments}}
   DefArg &operator=(const DefArg&, int k = 4) = default; // expected-error {{parameter of overloaded 'operator=' cannot have a default argument}}
   DefArg &operator=(DefArg&&, int k = 4) = default; // expected-error {{parameter of overloaded 'operator=' cannot have a default argument}}
diff --git a/clang/test/CXX/drs/dr12xx.cpp b/clang/test/CXX/drs/dr12xx.cpp
index a941366050e1ab..c23a515ba56cb9 100644
--- a/clang/test/CXX/drs/dr12xx.cpp
+++ b/clang/test/CXX/drs/dr12xx.cpp
@@ -32,7 +32,7 @@ namespace dr1213 { // dr1213: 7
 }
 
 #if __cplusplus >= 201103L
-namespace dr1223 { // dr1227: yes open
+namespace dr1223 { // dr1223: 17 drafting
 struct M;
 template <typename T>
 struct V;
diff --git a/clang/test/CXX/drs/dr13xx.cpp b/clang/test/CXX/drs/dr13xx.cpp
index feaf523c44fc27..3510695954e27c 100644
--- a/clang/test/CXX/drs/dr13xx.cpp
+++ b/clang/test/CXX/drs/dr13xx.cpp
@@ -254,6 +254,23 @@ namespace dr1330 { // dr1330: 4 c++11
 #endif
 }
 
+namespace dr1341 { // dr1341: sup P0683R1
+#if __cplusplus >= 202002L
+int a;
+const int b = 0; // #dr1341-b-decl
+struct S {
+  int x1 : 8 = 42;
+  int x2 : 8 { 42 };
+  int y1 : true ? 8 : a = 42;
+  int y2 : true ? 8 : b = 42;
+  // expected-error@-1            {{cannot assign to variable 'b' with const-qualified type 'const int'}}
+  // expected-note@#dr1341-b-decl {{variable 'b' declared const here}}
+  int y3 : (true ? 8 : b) = 42;
+  int z : 1 || new int { 0 };
+};
+#endif
+}
+
 namespace dr1346 { // dr1346: 3.5
   auto a(1); // expected-error 0-1{{extension}}
   auto b(1, 2); // expected-error {{multiple expressions}} expected-error 0-1{{extension}}
diff --git a/clang/test/CXX/drs/dr22xx.cpp b/clang/test/CXX/drs/dr22xx.cpp
index 414925f0d74cc2..cd849443b1119b 100644
--- a/clang/test/CXX/drs/dr22xx.cpp
+++ b/clang/test/CXX/drs/dr22xx.cpp
@@ -123,6 +123,28 @@ namespace CheckAfterMerging2 {
 #endif
 } // namespace dr2233
 
+namespace dr2267 { // dr2267: no
+#if __cplusplus >= 201103L
+struct A {} a;
+struct B { explicit B(const A&); }; // #dr2267-struct-B
+
+struct D { D(); };
+struct C { explicit operator D(); } c;
+
+B b1(a);
+const B &b2{a}; // FIXME ill-formed
+const B &b3(a);
+// expected-error@-1 {{no viable conversion from 'struct A' to 'const B'}}
+// expected-note@#dr2267-struct-B {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'struct A' to 'const B &' for 1st argument}}
+// expected-note@#dr2267-struct-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'struct A' to 'B &&' for 1st argument}}
+// expected-note@#dr2267-struct-B {{explicit constructor is not a candidate}}
+
+D d1(c);
+const D &d2{c}; // FIXME ill-formed
+const D &d3(c); // FIXME ill-formed
+#endif
+}
+
 namespace dr2292 { // dr2292: 9
 #if __cplusplus >= 201103L
   template<typename T> using id = T;
diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp
index 68fb4bdaaf2e7d..f1b5a1c26fec1b 100644
--- a/clang/test/CXX/drs/dr25xx.cpp
+++ b/clang/test/CXX/drs/dr25xx.cpp
@@ -81,6 +81,63 @@ using ::dr2521::operator""_div;
 #endif
 } // namespace dr2521
 
+
+#if __cplusplus >= 202302L
+namespace dr2553 { // dr2553: 18
+struct B {
+  virtual void f(this B&);   // expected-error {{an explicit object parameter cannot appear in a virtual function}} \
+                             // expected-note {{here}}
+  static void f(this B&);   // expected-error {{an explicit object parameter cannot appear in a static function}}
+  virtual void g(); // expected-note {{here}}
+};
+struct D : B {
+  void g(this D&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+};
+
+struct D2 : B {
+  void f(this B&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+};
+
+}
+#endif
+
+#if __cplusplus >= 202302L
+namespace dr2554 { // dr2554: 18 review
+struct B {
+  virtual void f(); // expected-note 3{{here}}
+};
+
+struct D : B {
+  void f(this D&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+};
+
+struct D2 : B {
+  void f(this B&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+};
+struct T {};
+struct D3 : B {
+  void f(this T&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+};
+
+}
+#endif
+
+#if __cplusplus >= 202302L
+namespace dr2561 { // dr2561: 18 review
+struct C {
+    constexpr C(auto) { }
+};
+void foo() {
+    constexpr auto b = [](this C) { return 1; };
+    constexpr int (*fp)(C) = b;
+    static_assert(fp(1) == 1);
+    static_assert((&decltype(b)::operator())(1) == 1);
+}
+
+}
+#endif
+
+
 namespace dr2565 { // dr2565: 16
 #if __cplusplus >= 202002L
   template<typename T>
diff --git a/clang/test/CXX/drs/dr26xx.cpp b/clang/test/CXX/drs/dr26xx.cpp
index 8dd07b63deb436..8517cd5872b183 100644
--- a/clang/test/CXX/drs/dr26xx.cpp
+++ b/clang/test/CXX/drs/dr26xx.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify
+// RUN: %clang_cc1 -std=c++2b -triple x86_64-unknown-unknown %s -verify
+
 
 namespace dr2621 { // dr2621: yes
 enum class E { a };
@@ -108,6 +110,7 @@ auto z = [a = 42](int a) { // expected-error {{a lambda parameter cannot shadow
 
 }
 
+#if __cplusplus >= 202302L
 namespace dr2650 { // dr2650: yes
 template <class T, T> struct S {};
 template <class T> int f(S<T, T{}>*); // expected-note {{type 'X' of non-type template parameter is not a structural type}}
@@ -116,6 +119,16 @@ class X {
 };
 int i0 = f<X>(0);   //expected-error {{no matching function for call to 'f'}}
 }
+#endif
+
+#if __cplusplus >= 202302L
+namespace dr2653 { // dr2653: 18
+  struct Test { void f(this const auto& = Test{}); };
+  // expected-error@-1 {{the explicit object parameter cannot have a default argument}}
+  auto L = [](this const auto& = Test{}){};
+  // expected-error@-1 {{the explicit object parameter cannot have a default argument}}
+}
+#endif
 
 namespace dr2654 { // dr2654: 16
 void f() {
@@ -171,3 +184,18 @@ void m() {
   bar(0);
 }
 }
+#if __cplusplus >= 202302L
+namespace dr2687 { // dr2687: 18
+struct S{
+    void f(int);
+    static void g(int);
+    void h(this const S&, int);
+};
+
+void test() {
+    (&S::f)(1); // expected-error {{called object type 'void (dr2687::S::*)(int)' is not a function or function pointer}}
+    (&S::g)(1);
+    (&S::h)(S(), 1);
+}
+}
+#endif
diff --git a/clang/test/CXX/drs/dr5xx.cpp b/clang/test/CXX/drs/dr5xx.cpp
index cba0c4e1241c5d..a3563481eac68e 100644
--- a/clang/test/CXX/drs/dr5xx.cpp
+++ b/clang/test/CXX/drs/dr5xx.cpp
@@ -771,10 +771,15 @@ namespace dr574 { // dr574: yes
   };
 #if __cplusplus >= 201103L
   struct C {
-    C &operator=(const C&) &; // expected-note {{not viable}} expected-note {{candidate}} expected-note {{here}}
+    C &operator=(const C&) &; // #574-overload1 \
+                              // expected-note {{not viable}} \
+                              // expected-note {{here}}
+
   };
   struct D {
-    D &operator=(const D&) &&; // expected-note {{not viable}} expected-note {{candidate}} expected-note {{here}}
+    D &operator=(const D&) &&; // #574-overload2 \
+                               // expected-note {{not viable}} \
+                               // expected-note {{here}}
   };
   void test(C c, D d) {
     c = c;
@@ -786,10 +791,17 @@ namespace dr574 { // dr574: yes
   struct Test {
     friend A &A::operator=(const A&); // expected-error {{does not match}}
     friend B &B::operator=(const B&); // expected-error {{does not match}}
-#if __cplusplus >= 201103L
+#if __cplusplus >= 202302L
+    friend C &C::operator=(const C&); // expected-error {{conflicting types for 'operator='}}
+    friend D &D::operator=(const D&); // expected-error {{conflicting types for 'operator='}} __cplusplus >= 201103L
+#elif __cplusplus >= 201103L
     // FIXME: We shouldn't produce the 'cannot overload' diagnostics here.
-    friend C &C::operator=(const C&); // expected-error {{does not match}} expected-error {{cannot overload}}
-    friend D &D::operator=(const D&); // expected-error {{does not match}} expected-error {{cannot overload}}
+    friend C &C::operator=(const C&); // expected-error {{does not match}} \
+                                      // expected-error {{cannot overload}} \
+                                      // expected-note@#574-overload1 {{candidate}}
+    friend D &D::operator=(const D&); // expected-error {{does not match}} \
+                                      // expected-error {{cannot overload}} \
+                                      // expected-note@#574-overload2 {{candidate}}
 #endif
   };
 }
diff --git a/clang/test/CXX/over/over.match/over.match.funcs/over.match.class.deduct/p2.cpp b/clang/test/CXX/over/over.match/over.match.funcs/over.match.class.deduct/p2.cpp
index 4eac0a1ac510f1..49fde292f6a36d 100644
--- a/clang/test/CXX/over/over.match/over.match.funcs/over.match.class.deduct/p2.cpp
+++ b/clang/test/CXX/over/over.match/over.match.funcs/over.match.class.deduct/p2.cpp
@@ -85,3 +85,38 @@ int main() {
 
 
 }
+
+namespace deduceTemplatedConstructor {
+template <typename X, typename Y> struct IsSame {
+    static constexpr bool value = false;
+};
+
+template <typename Z> struct IsSame<Z, Z> {
+    static constexpr bool value = true;
+};
+template <class T> struct A {
+  using value_type = T;
+  A(value_type);
+  A(const A&);
+  A(T, T, int);
+  template<class U>
+  A(int, T, U);
+};
+
+A x(1, 2, 3);       // no-error
+static_assert(IsSame<decltype(x),A<int>>::value);
+
+template <class T>
+A(T) -> A<T>;
+
+A a(42);
+static_assert(IsSame<decltype(a),A<int>>::value);
+A b = a;
+static_assert(IsSame<decltype(b),A<int>>::value);
+
+template <class T>
+A(A<T>) -> A<A<T>>;
+
+A b2 = a;
+static_assert(IsSame<decltype(b2),A<A<int>>>::value);
+}
diff --git a/clang/test/CXX/special/class.copy/p25-0x.cpp b/clang/test/CXX/special/class.copy/p25-0x.cpp
index 133bb1a87ce263..c13c686bd6b540 100644
--- a/clang/test/CXX/special/class.copy/p25-0x.cpp
+++ b/clang/test/CXX/special/class.copy/p25-0x.cpp
@@ -1,8 +1,16 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s -Wno-deprecated-builtins
 // RUN: %clang_cc1 -std=c++11 -verify %s -Wno-deprecated-builtins -fclang-abi-compat=14 -DCLANG_ABI_COMPAT=14
+// RUN: %clang_cc1 -fsyntax-only -std=c++2b -DDEDUCING_THIS -Wno-deprecated-builtins  %s -verify
 
 // expected-no-diagnostics
 
+#if DEDUCING_THIS
+#define EXPLICIT_PARAMETER(...) this __VA_ARGS__,
+#else
+#define EXPLICIT_PARAMETER(Param)
+#endif
+
+
 template<typename T, bool B> struct trivially_assignable_check {
   static_assert(B == __has_trivial_assign(T), "");
   static_assert(B == __is_trivially_assignable(T&, T), "");
@@ -23,14 +31,18 @@ using _ = trivially_assignable<Trivial>;
 
 // A copy/move assignment operator for class X is trivial if it is not user-provided,
 struct UserProvided {
-  UserProvided &operator=(const UserProvided &);
+  UserProvided &operator=(EXPLICIT_PARAMETER(UserProvided&)
+                          const UserProvided &);
 };
 using _ = not_trivially_assignable<UserProvided>;
 
 // its declared parameter type is the same as if it had been implicitly
 // declared,
 struct NonConstCopy {
-  NonConstCopy &operator=(NonConstCopy &) = default;
+  NonConstCopy &operator=(EXPLICIT_PARAMETER(NonConstCopy&) NonConstCopy &) = default;
+#if DEDUCING_THIS
+  NonConstCopy &operator=(EXPLICIT_PARAMETER(NonConstCopy&&) NonConstCopy &) = default;
+#endif
 };
 #if defined(CLANG_ABI_COMPAT) && CLANG_ABI_COMPAT <= 14
 // Up until (and including) Clang 14, non-const copy assignment operators were not trivial because
@@ -50,9 +62,20 @@ static_assert(!__is_trivially_assignable(NonConstCopy &&, NonConstCopy), "");
 static_assert(!__is_trivially_assignable(NonConstCopy &&, NonConstCopy &&), "");
 
 struct DefaultedSpecialMembers {
-  DefaultedSpecialMembers &operator=(const DefaultedSpecialMembers &) = default;
-  DefaultedSpecialMembers &operator=(DefaultedSpecialMembers &) = default;
-  DefaultedSpecialMembers &operator=(DefaultedSpecialMembers &&) = default;
+  DefaultedSpecialMembers &operator=(EXPLICIT_PARAMETER(DefaultedSpecialMembers&)
+                                     const DefaultedSpecialMembers &) = default;
+  DefaultedSpecialMembers &operator=(EXPLICIT_PARAMETER(DefaultedSpecialMembers&)
+                                     DefaultedSpecialMembers &) = default;
+  DefaultedSpecialMembers &operator=(EXPLICIT_PARAMETER(DefaultedSpecialMembers&)
+                                     DefaultedSpecialMembers &&) = default;
+#if DEDUCING_THIS
+  DefaultedSpecialMembers &operator=(EXPLICIT_PARAMETER(DefaultedSpecialMembers&&)
+                                     const DefaultedSpecialMembers &) = default;
+  DefaultedSpecialMembers &operator=(EXPLICIT_PARAMETER(DefaultedSpecialMembers&&)
+                                     DefaultedSpecialMembers &) = default;
+  DefaultedSpecialMembers &operator=(EXPLICIT_PARAMETER(DefaultedSpecialMembers&&)
+                                     DefaultedSpecialMembers &&) = default;
+#endif
 };
 using _ = trivially_assignable<DefaultedSpecialMembers>;
 #endif
@@ -84,11 +107,16 @@ static_assert(__is_trivially_assignable(MutableTemplateCtorMember, MutableTempla
 
 // Both trivial and non-trivial special members.
 struct TNT {
-  TNT &operator=(const TNT &) = default; // trivial
-  TNT &operator=(TNT &); // non-trivial
-
-  TNT &operator=(TNT &&) = default; // trivial
-  TNT &operator=(const TNT &&); // non-trivial
+  TNT &operator=(EXPLICIT_PARAMETER(TNT&) const TNT &) = default; // trivial
+  TNT &operator=(EXPLICIT_PARAMETER(TNT&) TNT &); // non-trivial
+  TNT &operator=(EXPLICIT_PARAMETER(TNT&) TNT &&) = default; // trivial
+  TNT &operator=(EXPLICIT_PARAMETER(TNT&) const TNT &&); // non-trivial
+#if DEDUCING_THIS
+  TNT &operator=(this TNT&&, const TNT &) = default; // trivial
+  TNT &operator=(this TNT&&, TNT &); // non-trivial
+  TNT &operator=(this TNT&&, TNT &&) = default; // trivial
+  TNT &operator=(this TNT&&, const TNT &&); // non-trivial
+#endif
 };
 
 static_assert(!__has_trivial_assign(TNT), "lie deliberately for gcc compatibility");
diff --git a/clang/test/CodeGen/PowerPC/ppc64-inline-asm.c b/clang/test/CodeGen/PowerPC/ppc64-inline-asm.c
index e3f2c2ff5507ce..005bf5c7fa1440 100644
--- a/clang/test/CodeGen/PowerPC/ppc64-inline-asm.c
+++ b/clang/test/CodeGen/PowerPC/ppc64-inline-asm.c
@@ -47,6 +47,6 @@ void testZ(void *addr) {
 void testZwOff(void *addr, long long off) {
   asm volatile ("dcbz %y0\n" :: "Z"(*(unsigned char *)(addr + off)) : "memory");
 // CHECK-LABEL: void @testZwOff(ptr noundef %addr, i64 noundef %off)
-// CHECK: %[[VAL:[^ ]+]] = getelementptr i8, ptr %addr, i64 %off
+// CHECK: %[[VAL:[^ ]+]] = getelementptr inbounds i8, ptr %addr, i64 %off
 // CHECK: call void asm sideeffect "dcbz ${0:y}\0A", "*Z,~{memory}"(ptr elementtype(i8) %[[VAL]])
 }
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 2d54b506425071..9178ecaf3f8fe4 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -596,6 +596,54 @@ __m256 test_mm256_cmp_ps_true_us(__m256 a, __m256 b) {
   return _mm256_cmp_ps(a, b, _CMP_TRUE_US);
 }
 
+__m128d test_mm_cmp_pd_eq_oq(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_eq_oq
+  // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_EQ_OQ);
+}
+
+__m128d test_mm_cmp_pd_lt_os(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_lt_os
+  // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_LT_OS);
+}
+
+__m128d test_mm_cmp_pd_le_os(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_le_os
+  // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_LE_OS);
+}
+
+__m128d test_mm_cmp_pd_unord_q(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_unord_q
+  // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_UNORD_Q);
+}
+
+__m128d test_mm_cmp_pd_neq_uq(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_neq_uq
+  // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_NEQ_UQ);
+}
+
+__m128d test_mm_cmp_pd_nlt_us(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_nlt_us
+  // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_NLT_US);
+}
+
+__m128d test_mm_cmp_pd_nle_us(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_nle_us
+  // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_NLE_US);
+}
+
+__m128d test_mm_cmp_pd_ord_q(__m128d a, __m128d b) {
+  // CHECK-LABEL: test_mm_cmp_pd_ord_q
+  // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
+  return _mm_cmp_pd(a, b, _CMP_ORD_Q);
+}
+
 __m128d test_mm_cmp_pd_eq_uq(__m128d a, __m128d b) {
   // CHECK-LABEL: test_mm_cmp_pd_eq_uq
   // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
@@ -740,6 +788,54 @@ __m128d test_mm_cmp_pd_true_us(__m128d a, __m128d b) {
   return _mm_cmp_pd(a, b, _CMP_TRUE_US);
 }
 
+__m128 test_mm_cmp_ps_eq_oq(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_eq_oq
+  // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_EQ_OQ);
+}
+
+__m128 test_mm_cmp_ps_lt_os(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_lt_os
+  // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_LT_OS);
+}
+
+__m128 test_mm_cmp_ps_le_os(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_le_os
+  // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_LE_OS);
+}
+
+__m128 test_mm_cmp_ps_unord_q(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_unord_q
+  // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_UNORD_Q);
+}
+
+__m128 test_mm_cmp_ps_neq_uq(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_neq_uq
+  // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_NEQ_UQ);
+}
+
+__m128 test_mm_cmp_ps_nlt_us(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_nlt_us
+  // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_NLT_US);
+}
+
+__m128 test_mm_cmp_ps_nle_us(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_nle_us
+  // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_NLE_US);
+}
+
+__m128 test_mm_cmp_ps_ord_q(__m128 a, __m128 b) {
+  // CHECK-LABEL: test_mm_cmp_ps_ord_q
+  // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
+  return _mm_cmp_ps(a, b, _CMP_ORD_Q);
+}
+
 __m128 test_mm_cmp_ps_eq_uq(__m128 a, __m128 b) {
   // CHECK-LABEL: test_mm_cmp_ps_eq_uq
   // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
diff --git a/clang/test/CodeGen/X86/bitscan-builtins.c b/clang/test/CodeGen/X86/bitscan-builtins.c
index a5a7808a82a234..9fd46664178519 100644
--- a/clang/test/CodeGen/X86/bitscan-builtins.c
+++ b/clang/test/CodeGen/X86/bitscan-builtins.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-unknown-unknown -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-unknown-unknown -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
 
 // PR33722
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple x86_64-unknown-unknown -fms-extensions -fms-compatibility-version=19.00 -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple x86_64-unknown-unknown -fms-extensions -fms-compatibility-version=19.00 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple x86_64-unknown-unknown -fms-extensions -fms-compatibility-version=19.00 -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple x86_64-unknown-unknown -fms-extensions -fms-compatibility-version=19.00 -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
 
 #include <x86intrin.h>
 
diff --git a/clang/test/CodeGen/X86/cmp-avx-builtins-error.c b/clang/test/CodeGen/X86/cmp-avx-builtins-error.c
deleted file mode 100644
index 2b35aa84492b64..00000000000000
--- a/clang/test/CodeGen/X86/cmp-avx-builtins-error.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +avx -emit-llvm -fsyntax-only -verify
-// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown \
-// RUN: -target-feature +avx -emit-llvm -fsyntax-only -verify
-
-#include <immintrin.h>
-
-__m128d test_mm_cmp_pd(__m128d a, __m128d b) {
-  return _mm_cmp_pd(a, b, 32);  // expected-error {{argument value 32 is outside the valid range [0, 31]}}
-}
-
-__m128d test_mm_cmp_sd(__m128d a, __m128d b) {
-  return _mm_cmp_sd(a, b, 32);  // expected-error {{argument value 32 is outside the valid range [0, 31]}}
-}
-
-__m128 test_mm_cmp_ps(__m128 a, __m128 b) {
-  return _mm_cmp_pd(a, b, 32);  // expected-error {{argument value 32 is outside the valid range [0, 31]}}
-}
-
-__m128 test_mm_cmp_ss(__m128 a, __m128 b) {
-  return _mm_cmp_sd(a, b, 32);  // expected-error {{argument value 32 is outside the valid range [0, 31]}}
-}
diff --git a/clang/test/CodeGen/X86/popcnt-builtins.c b/clang/test/CodeGen/X86/popcnt-builtins.c
index e59ffaa031a6a1..b27bc3f0597fb3 100644
--- a/clang/test/CodeGen/X86/popcnt-builtins.c
+++ b/clang/test/CodeGen/X86/popcnt-builtins.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +popcnt -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-POPCNT
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +popcnt -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-POPCNT
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +popcnt -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-POPCNT
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +popcnt -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-POPCNT
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
 
 #include <x86intrin.h>
 
diff --git a/clang/test/CodeGen/X86/rot-intrinsics.c b/clang/test/CodeGen/X86/rot-intrinsics.c
index f8c78119a1c4a7..5da300b024b5b8 100644
--- a/clang/test/CodeGen/X86/rot-intrinsics.c
+++ b/clang/test/CodeGen/X86/rot-intrinsics.c
@@ -1,16 +1,16 @@
-// RUN: %clang_cc1 -x c -ffreestanding -triple i686--linux -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64--linux -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG
-// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding -triple i686--linux -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding -triple x86_64--linux -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG
-// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
-// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c -ffreestanding -triple i686--linux -no-enable-noundef-analysis -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64--linux -no-enable-noundef-analysis -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding -triple i686--linux -no-enable-noundef-analysis -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding -triple x86_64--linux -no-enable-noundef-analysis -emit-llvm %s -o - | FileCheck %s --check-prefixes CHECK,CHECK-64BIT-LONG
+// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=i686-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
+// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse2 -no-enable-noundef-analysis -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes CHECK,CHECK-32BIT-LONG
 
 #include <x86intrin.h>
 
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 82f67da39e02bb..885c82856522d2 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -813,57 +813,3 @@ __m128 test_mm_xor_ps(__m128 A, __m128 B) {
   // CHECK: xor <4 x i32>
   return _mm_xor_ps(A, B);
 }
-
-__m128 test_mm_cmp_ps_eq_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_eq_oq
-  // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_EQ_OQ);
-}
-
-__m128 test_mm_cmp_ps_lt_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_lt_os
-  // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_LT_OS);
-}
-
-__m128 test_mm_cmp_ps_le_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_le_os
-  // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_LE_OS);
-}
-
-__m128 test_mm_cmp_ps_unord_q(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_unord_q
-  // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_UNORD_Q);
-}
-
-__m128 test_mm_cmp_ps_neq_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_neq_uq
-  // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_NEQ_UQ);
-}
-
-__m128 test_mm_cmp_ps_nlt_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nlt_us
-  // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_NLT_US);
-}
-
-__m128 test_mm_cmp_ps_nle_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nle_us
-  // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_NLE_US);
-}
-
-__m128 test_mm_cmp_ps_ord_q(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ord_q
-  // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
-  return _mm_cmp_ps(a, b, _CMP_ORD_Q);
-}
-
-__m128 test_mm_cmp_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_cmp_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
-  return _mm_cmp_ss(A, B, _CMP_ORD_Q);
-}
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 09b57e3a63727a..7165d2791827cf 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1719,57 +1719,3 @@ __m128i test_mm_xor_si128(__m128i A, __m128i B) {
   // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_xor_si128(A, B);
 }
-
-__m128d test_mm_cmp_pd_eq_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_eq_oq
-  // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_EQ_OQ);
-}
-
-__m128d test_mm_cmp_pd_lt_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_lt_os
-  // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_LT_OS);
-}
-
-__m128d test_mm_cmp_pd_le_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_le_os
-  // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_LE_OS);
-}
-
-__m128d test_mm_cmp_pd_unord_q(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_unord_q
-  // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_UNORD_Q);
-}
-
-__m128d test_mm_cmp_pd_neq_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_neq_uq
-  // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_NEQ_UQ);
-}
-
-__m128d test_mm_cmp_pd_nlt_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nlt_us
-  // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_NLT_US);
-}
-
-__m128d test_mm_cmp_pd_nle_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nle_us
-  // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_NLE_US);
-}
-
-__m128d test_mm_cmp_pd_ord_q(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ord_q
-  // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
-  return _mm_cmp_pd(a, b, _CMP_ORD_Q);
-}
-
-__m128d test_mm_cmp_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmp_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
-  return _mm_cmp_sd(A, B, _CMP_ORD_Q);
-}
diff --git a/clang/test/CodeGen/X86/x86-bswap.c b/clang/test/CodeGen/X86/x86-bswap.c
index fb4852c524087e..589dd83606983c 100644
--- a/clang/test/CodeGen/X86/x86-bswap.c
+++ b/clang/test/CodeGen/X86/x86-bswap.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -no-enable-noundef-analysis -emit-llvm -o - | FileCheck %s
 
 #include <x86intrin.h>
 
diff --git a/clang/test/CodeGen/aarch64-ls64.c b/clang/test/CodeGen/aarch64-ls64.c
index 39def71d71c7b0..8a61a9643dd3de 100644
--- a/clang/test/CodeGen/aarch64-ls64.c
+++ b/clang/test/CodeGen/aarch64-ls64.c
@@ -206,7 +206,7 @@ EXTERN_C void test_st64b(void)
 // CHECK-CXX-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP13]], align 8
 // CHECK-CXX-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[AGG_TMP]], i32 7
 // CHECK-CXX-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-CXX-NEXT:    [[TMP17:%.*]] = call i64 @llvm.aarch64.st64bv(ptr [[TMP1]], i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], i64 [[TMP16]])
+// CHECK-CXX-NEXT:    [[TMP17:%.*]] = call noundef i64 @llvm.aarch64.st64bv(ptr [[TMP1]], i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], i64 [[TMP16]])
 // CHECK-CXX-NEXT:    store i64 [[TMP17]], ptr @status, align 8
 // CHECK-CXX-NEXT:    ret void
 //
@@ -269,7 +269,7 @@ EXTERN_C void test_st64bv(void)
 // CHECK-CXX-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP13]], align 8
 // CHECK-CXX-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[AGG_TMP]], i32 7
 // CHECK-CXX-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 8
-// CHECK-CXX-NEXT:    [[TMP17:%.*]] = call i64 @llvm.aarch64.st64bv0(ptr [[TMP1]], i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], i64 [[TMP16]])
+// CHECK-CXX-NEXT:    [[TMP17:%.*]] = call noundef i64 @llvm.aarch64.st64bv0(ptr [[TMP1]], i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], i64 [[TMP16]])
 // CHECK-CXX-NEXT:    store i64 [[TMP17]], ptr @status, align 8
 // CHECK-CXX-NEXT:    ret void
 //
diff --git a/clang/test/CodeGen/address-space.c b/clang/test/CodeGen/address-space.c
index 83a434d88b8e8b..c92fc0dd968706 100644
--- a/clang/test/CodeGen/address-space.c
+++ b/clang/test/CodeGen/address-space.c
@@ -53,7 +53,7 @@ void test4(MyStruct __attribute__((address_space(2))) *pPtr) {
 // X86: [[ALLOCA:%.*]] = alloca ptr addrspace(1)
 // X86-NEXT: store ptr addrspace(1) %arg, ptr [[ALLOCA]]
 // X86-NEXT: load ptr addrspace(1), ptr [[ALLOCA]]
-// X86-NEXT: getelementptr i8, ptr addrspace(1)
+// X86-NEXT: getelementptr inbounds i8, ptr addrspace(1)
 // X86-NEXT: ret ptr addrspace(1)
 void __attribute__((address_space(1)))*
 void_ptr_arithmetic_test(void __attribute__((address_space(1))) *arg) {
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
new file mode 100644
index 00000000000000..a7eb0da6dd282a
--- /dev/null
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -0,0 +1,227 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -DCOUNTED_BY -O2 -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITH-ATTR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -DCOUNTED_BY -O2 -Wall -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck --check-prefix=NO-SANITIZE-WITH-ATTR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITHOUT-ATTR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -fstrict-flex-arrays=3 -emit-llvm -o - %s | FileCheck --check-prefix=NO-SANITIZE-WITHOUT-ATTR %s
+
+#if !__has_attribute(counted_by)
+#error "has attribute broken"
+#endif
+
+#ifdef COUNTED_BY
+#define __counted_by(member)    __attribute__((__counted_by__(member)))
+#else
+#define __counted_by(member)
+#endif
+
+typedef long unsigned int size_t;
+
+struct annotated {
+  unsigned long flags;
+  int count;
+  int array[] __counted_by(count);
+};
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// SANITIZE-WITH-ATTR-NEXT:  entry:
+// SANITIZE-WITH-ATTR-NEXT:    [[COUNT:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[COUNT]], align 8, !tbaa [[TBAA2:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = sext i32 [[INDEX]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP3]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF7:![0-9]+]], !nosanitize !6
+// SANITIZE-WITH-ATTR:       handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = zext i32 [[INDEX]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[TMP4]]) #[[ATTR2:[0-9]+]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize !6
+// SANITIZE-WITH-ATTR:       cont7:
+// SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[TMP1]]
+// SANITIZE-WITH-ATTR-NEXT:    store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef writeonly [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr nocapture noundef writeonly [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+void test1(struct annotated *p, int index, int val) {
+  p->array[index] = val;
+}
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT:  entry:
+// SANITIZE-WITH-ATTR-NEXT:    [[COUNT:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[COUNT]], align 8, !tbaa [[TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP2]], label [[CONT12:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF7]], !nosanitize !6
+// SANITIZE-WITH-ATTR:       handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR2]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize !6
+// SANITIZE-WITH-ATTR:       cont12:
+// SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP0]], 2
+// SANITIZE-WITH-ATTR-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[COUNT:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[COUNT]], align 8, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 2
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+void test2(struct annotated *p, size_t index) {
+  p->array[index] = __builtin_dynamic_object_size(p->array, 1);
+}
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT:  entry:
+// SANITIZE-WITH-ATTR-NEXT:    [[COUNT:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[COUNT]], align 8, !tbaa [[TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP2]], label [[CONT12:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF7]], !nosanitize !6
+// SANITIZE-WITH-ATTR:       handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[INDEX]]) #[[ATTR2]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize !6
+// SANITIZE-WITH-ATTR:       cont12:
+// SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP0]], 2
+// SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = add i32 [[TMP3]], 16
+// SANITIZE-WITH-ATTR-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[COUNT:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[COUNT]], align 8, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 2
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = add i32 [[TMP1]], 16
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+void test3(struct annotated *p, size_t index) {
+  // This test differs from 'test2' by checking bdos on the whole array and not
+  // just the FAM.
+  p->array[index] = __builtin_dynamic_object_size(p, 1);
+}
+
+struct annotated_with_anon_struct {
+  unsigned long flags;
+  struct {
+    unsigned char count;
+    int array[] __counted_by(count);
+  };
+};
+
+// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4(
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITH-ATTR-NEXT:  entry:
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED_WITH_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 8, !tbaa [[TBAA8:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = sext i32 [[INDEX]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP2]], [[TMP3]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP4]], label [[CONT18:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF7]], !nosanitize !6
+// SANITIZE-WITH-ATTR:       handler.out_of_bounds:
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP5:%.*]] = zext i32 [[INDEX]] to i64, !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[TMP5]]) #[[ATTR2]], !nosanitize !6
+// SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize !6
+// SANITIZE-WITH-ATTR:       cont18:
+// SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
+// SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[TMP2]]
+// SANITIZE-WITH-ATTR-NEXT:    [[TMP6:%.*]] = shl i8 [[TMP1]], 2
+// SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = zext i8 [[TMP6]] to i32
+// SANITIZE-WITH-ATTR-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4(
+// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED_WITH_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 8, !tbaa [[TBAA6:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP1]], 2
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = zext i8 [[TMP2]] to i32
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    ret void
+//
+// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4(
+// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4(
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret void
+//
+void test4(struct annotated_with_anon_struct *p, int index) {
+  p->array[index] = __builtin_dynamic_object_size(p->array, 1);
+}
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 8d4a4f137569cc..a7cd6f7bf802c5 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -35,7 +35,7 @@ inline int __attribute__((target_version("sve+sve-bf16"))) fmv_inline(void) { re
 inline int __attribute__((target_version("sve2-aes+sve2-sha3"))) fmv_inline(void) { return 5; }
 inline int __attribute__((target_version("sve2+sve2-pmull128+sve2-bitperm"))) fmv_inline(void) { return 9; }
 inline int __attribute__((target_version("sve2-sm4+memtag2"))) fmv_inline(void) { return 10; }
-inline int __attribute__((target_version("memtag3"))) fmv_inline(void) { return 11; }
+inline int __attribute__((target_version("memtag3+rcpc3"))) fmv_inline(void) { return 11; }
 inline int __attribute__((target_version("default"))) fmv_inline(void) { return 3; }
 
 __attribute__((target_version("ls64"))) int fmv_e(void);
@@ -289,68 +289,68 @@ int hoo(void) {
 // CHECK-NEXT:    ret ptr @fmv_inline._Msha3Mi8mmMf32mm
 // CHECK:       resolver_else6:
 // CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 19791209299968
-// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 19791209299968
+// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 288265560523800576
+// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 288265560523800576
 // CHECK-NEXT:    [[TMP19:%.*]] = and i1 true, [[TMP18]]
 // CHECK-NEXT:    br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]]
 // CHECK:       resolver_return7:
-// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-sm4Mmemtag2
+// CHECK-NEXT:    ret ptr @fmv_inline._Mrcpc3Mmemtag3
 // CHECK:       resolver_else8:
 // CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 1236950581248
-// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 1236950581248
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 19791209299968
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 19791209299968
 // CHECK-NEXT:    [[TMP23:%.*]] = and i1 true, [[TMP22]]
 // CHECK-NEXT:    br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]]
 // CHECK:       resolver_return9:
-// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-aesMsve2-sha3
+// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-sm4Mmemtag2
 // CHECK:       resolver_else10:
 // CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 4295098368
-// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 4295098368
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 1236950581248
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 1236950581248
 // CHECK-NEXT:    [[TMP27:%.*]] = and i1 true, [[TMP26]]
 // CHECK-NEXT:    br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]]
 // CHECK:       resolver_return11:
-// CHECK-NEXT:    ret ptr @fmv_inline._MditMsve-ebf16
+// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-aesMsve2-sha3
 // CHECK:       resolver_else12:
 // CHECK-NEXT:    [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 3221225472
-// CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 3221225472
+// CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 4295098368
+// CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 4295098368
 // CHECK-NEXT:    [[TMP31:%.*]] = and i1 true, [[TMP30]]
 // CHECK-NEXT:    br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]]
 // CHECK:       resolver_return13:
-// CHECK-NEXT:    ret ptr @fmv_inline._MsveMsve-bf16
+// CHECK-NEXT:    ret ptr @fmv_inline._MditMsve-ebf16
 // CHECK:       resolver_else14:
 // CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = and i64 [[TMP32]], 20971520
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 20971520
+// CHECK-NEXT:    [[TMP33:%.*]] = and i64 [[TMP32]], 3221225472
+// CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 3221225472
 // CHECK-NEXT:    [[TMP35:%.*]] = and i1 true, [[TMP34]]
 // CHECK-NEXT:    br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]]
 // CHECK:       resolver_return15:
-// CHECK-NEXT:    ret ptr @fmv_inline._MrcpcMfrintts
+// CHECK-NEXT:    ret ptr @fmv_inline._MsveMsve-bf16
 // CHECK:       resolver_else16:
 // CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP37:%.*]] = and i64 [[TMP36]], 8650752
-// CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 8650752
+// CHECK-NEXT:    [[TMP37:%.*]] = and i64 [[TMP36]], 20971520
+// CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 20971520
 // CHECK-NEXT:    [[TMP39:%.*]] = and i1 true, [[TMP38]]
 // CHECK-NEXT:    br i1 [[TMP39]], label [[RESOLVER_RETURN17:%.*]], label [[RESOLVER_ELSE18:%.*]]
 // CHECK:       resolver_return17:
-// CHECK-NEXT:    ret ptr @fmv_inline._MdpbMrcpc2
+// CHECK-NEXT:    ret ptr @fmv_inline._MrcpcMfrintts
 // CHECK:       resolver_else18:
 // CHECK-NEXT:    [[TMP40:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP41:%.*]] = and i64 [[TMP40]], 1572864
-// CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 1572864
+// CHECK-NEXT:    [[TMP41:%.*]] = and i64 [[TMP40]], 8650752
+// CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 8650752
 // CHECK-NEXT:    [[TMP43:%.*]] = and i1 true, [[TMP42]]
 // CHECK-NEXT:    br i1 [[TMP43]], label [[RESOLVER_RETURN19:%.*]], label [[RESOLVER_ELSE20:%.*]]
 // CHECK:       resolver_return19:
-// CHECK-NEXT:    ret ptr @fmv_inline._Mdpb2Mjscvt
+// CHECK-NEXT:    ret ptr @fmv_inline._MdpbMrcpc2
 // CHECK:       resolver_else20:
 // CHECK-NEXT:    [[TMP44:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP45:%.*]] = and i64 [[TMP44]], 35184372088832
-// CHECK-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 35184372088832
+// CHECK-NEXT:    [[TMP45:%.*]] = and i64 [[TMP44]], 1572864
+// CHECK-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 1572864
 // CHECK-NEXT:    [[TMP47:%.*]] = and i1 true, [[TMP46]]
 // CHECK-NEXT:    br i1 [[TMP47]], label [[RESOLVER_RETURN21:%.*]], label [[RESOLVER_ELSE22:%.*]]
 // CHECK:       resolver_return21:
-// CHECK-NEXT:    ret ptr @fmv_inline._Mmemtag3
+// CHECK-NEXT:    ret ptr @fmv_inline._Mdpb2Mjscvt
 // CHECK:       resolver_else22:
 // CHECK-NEXT:    ret ptr @fmv_inline
 // CHECK-LABEL: @fmv_e.resolver(
@@ -437,7 +437,7 @@ int hoo(void) {
 // CHECK-LABEL: @fmv_inline._Msve2-sm4Mmemtag2(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 10
-// CHECK-LABEL: @fmv_inline._Mmemtag3(
+// CHECK-LABEL: @fmv_inline._Mrcpc3Mmemtag3(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 11
 // CHECK-LABEL: @fmv_inline(
@@ -534,7 +534,7 @@ int hoo(void) {
 // CHECK: attributes #20 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3" }
 // CHECK: attributes #21 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm" }
 // CHECK: attributes #22 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+mte,+neon,+sve,+sve2,+sve2-sm4" }
-// CHECK: attributes #23 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+mte" }
+// CHECK: attributes #23 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+mte,+rcpc,+rcpc3" }
 // CHECK: attributes #24 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+sb" }
 
 // CHECK-NOFMV: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
diff --git a/clang/test/CodeGen/auto.c b/clang/test/CodeGen/auto.c
new file mode 100644
index 00000000000000..e5bc1bf66138f5
--- /dev/null
+++ b/clang/test/CodeGen/auto.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -std=c2x -emit-llvm %s -o - | FileCheck %s
+
+void basic_types(void) {
+  auto nb = 4;          // CHECK: alloca i32
+  auto dbl = 4.3;       // CHECK: alloca double
+  auto lng = 4UL;       // CHECK: alloca i{{32|64}}
+  auto bl = true;       // CHECK: alloca i8
+  auto chr = 'A';       // CHECK: alloca i{{8|32}}
+  auto str = "Test";    // CHECK: alloca ptr
+  auto str2[] = "Test"; // CHECK: alloca [5 x i8]
+  auto nptr = nullptr;  // CHECK: alloca ptr
+}
+
+void misc_declarations(void) {
+  // FIXME: this should end up being rejected when we implement underspecified
+  // declarations in N3006.
+  auto strct_ptr = (struct { int a; } *)0;  // CHECK: alloca ptr
+  auto int_cl = (int){13};                  // CHECK: alloca i32
+  auto double_cl = (double){2.5};           // CHECK: alloca double
+
+  auto se = ({      // CHECK: alloca i32
+    auto snb = 12;  // CHECK: alloca i32
+    snb;
+  });
+}
+
+void loop(void) {
+  auto j = 4;                       // CHECK: alloca i32
+  for (auto i = j; i < 2 * j; i++); // CHECK: alloca i32
+}
+
+#define AUTO_MACRO(_NAME, ARG, ARG2, ARG3) auto _NAME = ARG + (ARG2 / ARG3);
+
+#define AUTO_INT_MACRO(_NAME, ARG, ARG2, ARG3) auto _NAME = (ARG ^ ARG2) & ARG3;
+
+int macros(int in_int) {
+  auto a = in_int + 1;             // CHECK: alloca i32
+  AUTO_MACRO(b, 1.3, 2.5f, 3);     // CHECK: alloca double
+  AUTO_INT_MACRO(c, 64, 23, 0xff); // CHECK: alloca i32
+  return (a + (int)b) - c;         // CHECK: ret i32 %{{.*}}
+}
diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c
index 0e0a9b157464a6..015102940890a5 100644
--- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c
+++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c
@@ -35,6 +35,7 @@
 // CHECK-SANITIZE-ANYRECOVER-C-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 15 } }
 // CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 15 } }
 // CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 15 } }
+// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_1700:.*]] = {{.*}}, i32 1700, i32 15 } }
 
 #ifdef __cplusplus
 extern "C" {
@@ -427,6 +428,48 @@ char *allones_allones_OK(void) {
   return base + offset;
 }
 
+// C++ does not allow void* arithmetic even as a GNU extension. Replace void*
+// with char* in that case to keep test expectations the same.
+#ifdef __cplusplus
+char *void_ptr(char *base, unsigned long offset) {
+#else
+char *void_ptr(void *base, unsigned long offset) {
+#endif
+  // CHECK: define{{.*}} ptr @void_ptr(ptr noundef %[[BASE:.*]], i64 noundef %[[OFFSET:.*]])
+  // CHECK-NEXT:                      [[ENTRY:.*]]:
+  // CHECK-NEXT:                        %[[BASE_ADDR:.*]] = alloca ptr, align 8
+  // CHECK-NEXT:                        %[[OFFSET_ADDR:.*]] = alloca i64, align 8
+  // CHECK-NEXT:                        store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8
+  // CHECK-NEXT:                        store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8
+  // CHECK-NEXT:                        %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8
+  // CHECK-NEXT:                        %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8
+  // CHECK-NEXT:                        %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]]
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_OFFSET:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 0, !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], %[[COMPUTED_OFFSET]], !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_GEP_IS_NOT_NULL:.*]] = icmp ne i64 %[[COMPUTED_GEP]], 0, !nosanitize
+  // CHECK-SANITIZE-C-NEXT:             %[[BOTH_POINTERS_ARE_NULL_OR_BOTH_ARE_NONNULL:.*]] = and i1 %[[BASE_IS_NOT_NULLPTR]], %[[COMPUTED_GEP_IS_NOT_NULL]], !nosanitize
+  // CHECK-SANITIZE-CPP-NEXT:           %[[BOTH_POINTERS_ARE_NULL_OR_BOTH_ARE_NONNULL:.*]] = icmp eq i1 %[[BASE_IS_NOT_NULLPTR]], %[[COMPUTED_GEP_IS_NOT_NULL]], !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_OFFSET_DID_NOT_OVERFLOW:.*]] = xor i1 %[[OR_OV]], true, !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[COMPUTED_GEP_IS_UGE_BASE:.*]] = icmp uge i64 %[[COMPUTED_GEP]], %[[BASE_RELOADED_INT]], !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[GEP_DID_NOT_OVERFLOW:.*]] = and i1 %[[COMPUTED_GEP_IS_UGE_BASE]], %[[COMPUTED_OFFSET_DID_NOT_OVERFLOW]], !nosanitize
+  // CHECK-SANITIZE-NEXT:               %[[GEP_IS_OKAY:.*]] = and i1 %[[BOTH_POINTERS_ARE_NULL_OR_BOTH_ARE_NONNULL]], %[[GEP_DID_NOT_OVERFLOW]], !nosanitize
+  // CHECK-SANITIZE-NEXT:               br i1 %[[GEP_IS_OKAY]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize
+  // CHECK-SANITIZE:                  [[HANDLER_POINTER_OVERFLOW]]:
+  // CHECK-SANITIZE-NORECOVER-NEXT:     call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1700]], i64 %[[BASE_RELOADED_INT]], i64 %[[COMPUTED_GEP]])
+  // CHECK-SANITIZE-RECOVER-NEXT:       call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1700]], i64 %[[BASE_RELOADED_INT]], i64 %[[COMPUTED_GEP]])
+  // CHECK-SANITIZE-TRAP-NEXT:          call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize
+  // CHECK-SANITIZE-UNREACHABLE-NEXT:   unreachable, !nosanitize
+  // CHECK-SANITIZE:                  [[CONT]]:
+  // CHECK-NEXT:                        ret ptr %[[ADD_PTR]]
+#line 1700
+  return base + offset;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c
index af02a6ddaef990..d7994bab35d81a 100644
--- a/clang/test/CodeGen/code-coverage.c
+++ b/clang/test/CodeGen/code-coverage.c
@@ -15,10 +15,10 @@
 // RUN: %clang_cc1 -emit-llvm-bc -o /dev/null -fdebug-pass-manager -coverage-data-file=/dev/null %s 2>&1 | FileCheck --check-prefix=NEWPM %s
 // RUN: %clang_cc1 -emit-llvm-bc -o /dev/null -fdebug-pass-manager -coverage-data-file=/dev/null -O3 %s 2>&1 | FileCheck --check-prefix=NEWPM-O3 %s
 
-// NEWPM-NOT: Running pass
+// NEWPM: Running pass: VerifierPass
 // NEWPM: Running pass: GCOVProfilerPass
 
-// NEWPM-O3-NOT: Running pass
+// NEWPM-O3: Running pass: VerifierPass
 // NEWPM-O3: Running pass: Annotation2MetadataPass
 // NEWPM-O3: Running pass: ForceFunctionAttrsPass
 // NEWPM-O3: Running pass: GCOVProfilerPass
diff --git a/clang/test/CodeGen/constantexpr-fneg.c b/clang/test/CodeGen/constantexpr-fneg.c
index f82b3ff726578b..3cd4db34f61acb 100644
--- a/clang/test/CodeGen/constantexpr-fneg.c
+++ b/clang/test/CodeGen/constantexpr-fneg.c
@@ -8,9 +8,11 @@
 // CHECK:      entry:
 // CHECK-NEXT:   %retval = alloca i32
 // CHECK-NEXT:   store i32 0, ptr %retval
+// CHECK-NEXT:   [[ZEXT:%.*]] = zext i1 true to i32
+// CHECK-NEXT:   [[SITOFP:%.*]] = sitofp i32 [[ZEXT]] to float
 // CHECK-NEXT:   [[LV:%.*]] = load ptr, ptr @c
-// CHECK-NEXT:   store float 1.000000e+00, ptr [[LV]], align 4
-// CHECK-NEXT:   [[FNEG:%.*]] = fneg float 1.000000e+00
+// CHECK-NEXT:   store float [[SITOFP]], ptr [[LV]], align 4
+// CHECK-NEXT:   [[FNEG:%.*]] = fneg float [[SITOFP]]
 // CHECK-NEXT:   [[CONV:%.*]] = fptosi float [[FNEG]] to i32
 // CHECK-NEXT:   ret i32 [[CONV]]
 
diff --git a/clang/test/CodeGen/fp-contract-fast-pragma.cpp b/clang/test/CodeGen/fp-contract-fast-pragma.cpp
index a88ddac4037bb6..0bb01d6e17a1d6 100644
--- a/clang/test/CodeGen/fp-contract-fast-pragma.cpp
+++ b/clang/test/CodeGen/fp-contract-fast-pragma.cpp
@@ -46,7 +46,7 @@ float fp_contract_3(float a, float b, float c) {
   // CHECK: %[[M:.+]] = fmul contract float %a, %b
   // CHECK-NEXT: fadd contract float %[[M]], %c
   // STRICT: %[[M:.+]] = tail call contract float @llvm.experimental.constrained.fmul.f32(float %a, float %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  // STRICT-NEXT: tail call contract float @llvm.experimental.constrained.fadd.f32(float %[[M]], float %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  // STRICT-NEXT: tail call contract noundef float @llvm.experimental.constrained.fadd.f32(float %[[M]], float %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
   return template_muladd<float>(a, b, c);
 }
 
diff --git a/clang/test/CodeGen/fp-contract-on-pragma.cpp b/clang/test/CodeGen/fp-contract-on-pragma.cpp
index 812a7176b515ce..bfd82a8022e606 100644
--- a/clang/test/CodeGen/fp-contract-on-pragma.cpp
+++ b/clang/test/CodeGen/fp-contract-on-pragma.cpp
@@ -31,7 +31,7 @@ T template_muladd(T a, T b, T c) {
 
 float fp_contract_3(float a, float b, float c) {
   // CHECK: _Z13fp_contract_3fff
-  // CHECK: tail call float @llvm.fmuladd
+  // CHECK: tail call noundef float @llvm.fmuladd
   return template_muladd<float>(a, b, c);
 }
 
diff --git a/clang/test/CodeGen/fp-contract-pragma.cpp b/clang/test/CodeGen/fp-contract-pragma.cpp
index a628d7c1bd228a..c503bf69147c4a 100644
--- a/clang/test/CodeGen/fp-contract-pragma.cpp
+++ b/clang/test/CodeGen/fp-contract-pragma.cpp
@@ -31,7 +31,7 @@ T template_muladd(T a, T b, T c) {
 
 float fp_contract_3(float a, float b, float c) {
 // CHECK: _Z13fp_contract_3fff
-// CHECK: tail call float @llvm.fmuladd
+// CHECK: tail call noundef float @llvm.fmuladd
   return template_muladd<float>(a, b, c);
 }
 
diff --git a/clang/test/CodeGen/large-data-threshold.c b/clang/test/CodeGen/large-data-threshold.c
index 650a7fbb0094e6..29ae19e9b71899 100644
--- a/clang/test/CodeGen/large-data-threshold.c
+++ b/clang/test/CodeGen/large-data-threshold.c
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -S %s -o - -mcmodel=medium -mlarge-data-threshold=200 | FileCheck %s --check-prefix=ASM-SMALL
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -S %s -o - -mcmodel=medium -mlarge-data-threshold=2 | FileCheck %s --check-prefix=ASM-LARGE
 
-// IR-DEFAULT: !{i32 1, !"Large Data Threshold", i64 0}
+// IR-DEFAULT: !{i32 1, !"Large Data Threshold", i64 65535}
 // IR-CUSTOM: !{i32 1, !"Large Data Threshold", i64 200}
 
 // ASM-SMALL-NOT: movabsq
diff --git a/clang/test/CodeGen/lto-newpm-pipeline.c b/clang/test/CodeGen/lto-newpm-pipeline.c
index 1aaa7d46f3ff06..f58757efbf686f 100644
--- a/clang/test/CodeGen/lto-newpm-pipeline.c
+++ b/clang/test/CodeGen/lto-newpm-pipeline.c
@@ -25,7 +25,9 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm-bc -o /dev/null -mllvm -verify-analysis-invalidation=0 -fdebug-pass-manager -flto=thin -Oz %s 2>&1 | FileCheck %s \
 // RUN:   -check-prefix=CHECK-THIN-OPTIMIZED
 
-// CHECK-FULL-O0: Running pass: AlwaysInlinerPass
+// CHECK-FULL-O0: Running pass: VerifierPass
+// CHECK-FULL-O0-NEXT: Running analysis: VerifierAnalysis
+// CHECK-FULL-O0-NEXT: Running pass: AlwaysInlinerPass
 // CHECK-FULL-O0-NEXT: Running analysis: InnerAnalysisManagerProxy
 // CHECK-FULL-O0-NEXT: Running analysis: ProfileSummaryAnalysis
 // CHECK-FULL-O0-NEXT: Running pass: CoroConditionalWrapper
@@ -34,10 +36,11 @@
 // CHECK-FULL-O0-NEXT: Running pass: AnnotationRemarksPass
 // CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis
 // CHECK-FULL-O0-NEXT: Running pass: VerifierPass
-// CHECK-FULL-O0-NEXT: Running analysis: VerifierAnalysis
 // CHECK-FULL-O0-NEXT: Running pass: BitcodeWriterPass
 
-// CHECK-THIN-O0: Running pass: AlwaysInlinerPass
+// CHECK-THIN-O0: Running pass: VerifierPass
+// CHECK-THIN-O0-NEXT: Running analysis: VerifierAnalysis
+// CHECK-THIN-O0-NEXT: Running pass: AlwaysInlinerPass
 // CHECK-THIN-O0-NEXT: Running analysis: InnerAnalysisManagerProxy
 // CHECK-THIN-O0-NEXT: Running analysis: ProfileSummaryAnalysis
 // CHECK-THIN-O0-NEXT: Running pass: CoroConditionalWrapper
@@ -46,7 +49,6 @@
 // CHECK-THIN-O0-NEXT: Running pass: AnnotationRemarksPass
 // CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis
 // CHECK-THIN-O0-NEXT: Running pass: VerifierPass
-// CHECK-THIN-O0-NEXT: Running analysis: VerifierAnalysis
 // CHECK-THIN-O0-NEXT: Running pass: ThinLTOBitcodeWriterPass
 
 // TODO: The LTO pre-link pipeline currently invokes
diff --git a/clang/test/CodeGen/target-features-error-2.c b/clang/test/CodeGen/target-features-error-2.c
index 4f8bc8712aa51d..60586fb57f1c04 100644
--- a/clang/test/CodeGen/target-features-error-2.c
+++ b/clang/test/CodeGen/target-features-error-2.c
@@ -14,8 +14,8 @@ int baz(__m256i a) {
 #endif
 
 #if NEED_AVX_2
-__m256 need_avx(__m256 a, __m256 b) {
-  return _mm256_cmp_ps(a, b, 0); // expected-error {{'__builtin_ia32_cmpps256' needs target feature avx}}
+__m128 need_avx(__m128 a, __m128 b) {
+  return _mm_cmp_ps(a, b, 0); // expected-error {{'__builtin_ia32_cmpps' needs target feature avx}}
 }
 #endif
 
diff --git a/clang/test/CodeGen/ubsan-function-sugared.cpp b/clang/test/CodeGen/ubsan-function-sugared.cpp
index 238bf31f4aba6d..fb2487c024ba90 100644
--- a/clang/test/CodeGen/ubsan-function-sugared.cpp
+++ b/clang/test/CodeGen/ubsan-function-sugared.cpp
@@ -40,5 +40,4 @@ void caller() {
 }
 
 // GNU:  ![[FUNCSAN]] = !{i32 -1056584962, i32 905068220}
-// FIXME: Wrong hash
-// MSVC: ![[FUNCSAN]] = !{i32 -1056584962, i32 165986058}
+// MSVC: ![[FUNCSAN]] = !{i32 -1056584962, i32 -1600339357}
diff --git a/clang/test/CodeGenCUDA/cuda-builtin-vars.cu b/clang/test/CodeGenCUDA/cuda-builtin-vars.cu
index e76e7a2f825290..ba5e5f13ebe707 100644
--- a/clang/test/CodeGenCUDA/cuda-builtin-vars.cu
+++ b/clang/test/CodeGenCUDA/cuda-builtin-vars.cu
@@ -6,21 +6,21 @@
 __attribute__((global))
 void kernel(int *out) {
   int i = 0;
-  out[i++] = threadIdx.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  out[i++] = threadIdx.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  out[i++] = threadIdx.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  out[i++] = threadIdx.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  out[i++] = threadIdx.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  out[i++] = threadIdx.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 
-  out[i++] = blockIdx.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  out[i++] = blockIdx.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  out[i++] = blockIdx.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  out[i++] = blockIdx.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  out[i++] = blockIdx.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  out[i++] = blockIdx.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
 
-  out[i++] = blockDim.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  out[i++] = blockDim.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  out[i++] = blockDim.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+  out[i++] = blockDim.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  out[i++] = blockDim.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  out[i++] = blockDim.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 
-  out[i++] = gridDim.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  out[i++] = gridDim.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-  out[i++] = gridDim.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  out[i++] = gridDim.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  out[i++] = gridDim.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  out[i++] = gridDim.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 
   out[i++] = warpSize; // CHECK: store i32 32,
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp
index 48b1d8ed65d7e5..950921f67509f4 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp
@@ -8,7 +8,6 @@
 // CHECK-DAG: $_ZTS1B = comdat any
 // CHECK-DAG: $_ZTI1B = comdat any
 // CHECK-DAG: $_ZTI1B.rtti_proxy = comdat any
-// CHECK-DAG: $_ZTI1A.rtti_proxy = comdat any
 
 // VTable for B is emitted here since we access it when creating an instance of B. The VTable is also linkonce_odr and in its own comdat.
 // CHECK-DAG: @_ZTV1B.local = linkonce_odr hidden unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, comdat($_ZTV1B), align 4
@@ -18,7 +17,7 @@
 // CHECK-DAG: @_ZTS1B =
 // CHECK-DAG: @_ZTI1A =
 // CHECK-DAG: @_ZTI1B =
-// CHECK-DAG: @_ZTI1B.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1B, comdat
+// CHECK-DAG: @_ZTI1B.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1B, comdat
 
 // We will emit a vtable for B here, so it does have an alias, but we will not
 // emit one for A.
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp
index 5f5f9edf411a80..ee710100152bfa 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp
@@ -18,7 +18,7 @@
 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
 // CHECK: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00", comdat, align 1
 // CHECK: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, comdat, align 8
-// CHECK: @_ZTI1A.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1A, comdat
+// CHECK: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, comdat
 // CHECK: @_ZTV1A = linkonce_odr unnamed_addr alias { [3 x i32] }, ptr @_ZTV1A.local
 
 // CHECK:      define linkonce_odr void @_ZN1A3fooEv(ptr {{.*}}%this) unnamed_addr #{{[0-9]+}} comdat
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp
index d0777b0f1245b6..7657a3bd0efdee 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp
@@ -6,7 +6,7 @@
 /// hwasan-instrumented.
 // CHECK-DAG: @_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, no_sanitize_hwaddress, align 4
 // CHECK-DAG: @_ZTV1A = unnamed_addr alias { [3 x i32] }, ptr @_ZTV1A.local
-// CHECK-DAG: @_ZTI1A.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1A, no_sanitize_hwaddress, comdat
+// CHECK-DAG: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, no_sanitize_hwaddress, comdat
 
 class A {
 public:
@@ -22,7 +22,7 @@ void A_foo(A *a) {
 /// If the vtable happens to be hidden, then the alias is not needed. In this
 /// case, the original vtable struct itself should be unsanitized.
 // CHECK-DAG: @_ZTV1B = hidden unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1B, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1B, i32 0, i32 0, i32 2) to i64)) to i32)] }, no_sanitize_hwaddress, align 4
-// CHECK-DAG: @_ZTI1B.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1B, no_sanitize_hwaddress, comdat
+// CHECK-DAG: @_ZTI1B.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1B, no_sanitize_hwaddress, comdat
 
 class __attribute__((visibility("hidden"))) B {
 public:
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp
index 76de83d9d2cd46..5ed4745b9a0691 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp
@@ -14,7 +14,7 @@
 // CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8
 
 // The rtti should be in a comdat
-// CHECK: @_ZTI1A.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1A, comdat
+// CHECK: @_ZTI1A.rtti_proxy = {{.*}}comdat
 
 // The vtable symbol is exposed through an alias.
 // @_ZTV1A = dso_local unnamed_addr alias { [3 x i32] }, ptr @_ZTV1A.local
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp
index 42f28a770e7277..bc10461187b758 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp
@@ -13,8 +13,8 @@
 // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
 // CHECK: @_ZTS1B ={{.*}} constant [3 x i8] c"1B\00", align 1
 // CHECK: @_ZTI1B ={{.*}} constant { ptr, ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i32 8), ptr @_ZTS1B, ptr @_ZTI1A }, align 8
-// CHECK: @_ZTI1A.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1A, comdat
-// CHECK: @_ZTI1B.rtti_proxy = hidden unnamed_addr constant ptr @_ZTI1B, comdat
+// CHECK: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, comdat
+// CHECK: @_ZTI1B.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1B, comdat
 
 // CHECK:      define {{.*}}ptr @_Z11getTypeInfov() local_unnamed_addr
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGenCXX/cxx2b-deducing-this-cc.cpp b/clang/test/CodeGenCXX/cxx2b-deducing-this-cc.cpp
new file mode 100644
index 00000000000000..bfbb24fcf23d9a
--- /dev/null
+++ b/clang/test/CodeGenCXX/cxx2b-deducing-this-cc.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -std=c++2b %s -emit-llvm -triple i386-windows-msvc -o - | FileCheck %s
+
+namespace CC {
+
+struct T {
+   static void f(T&);
+   void __cdecl g(this T&);
+   void __thiscall h(this T&);
+   void i(this T&);
+};
+
+void a() {
+    T t;
+    T::f(t);
+}
+// CHECK: define dso_local void @"?a@CC@@YAXXZ"{{.*}}
+// CHECK: call void @"?f@T@CC@@SAXAAU12@@Z"{{.*}}
+
+void b() {
+    T t;
+    t.g();
+}
+// CHECK: define dso_local void @"?b@CC@@YAXXZ"{{.*}}
+// CHECK: call void @"?g@T@CC@@SAX_VAAU12@@Z"{{.*}}
+
+void c() {
+    T t;
+    t.h();
+}
+// CHECK: define dso_local void @"?c@CC@@YAXXZ"{{.*}}
+// CHECK: call x86_thiscallcc void @"?h@T@CC@@SEX_VAAU12@@Z"{{.*}}
+
+void d() {
+    T t;
+    t.i();
+}
+// CHECK: define dso_local void @"?d@CC@@YAXXZ"{{.*}}
+// CHECK: call void @"?i@T@CC@@SAX_VAAU12@@Z"{{.*}}
+
+}
diff --git a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
new file mode 100644
index 00000000000000..de8c124c050eb0
--- /dev/null
+++ b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -std=c++2b %s -emit-llvm -triple x86_64-linux -o - | FileCheck %s
+
+struct TrivialStruct {
+    void explicit_object_function(this TrivialStruct) {}
+};
+void test() {
+    TrivialStruct s;
+    s.explicit_object_function();
+}
+// CHECK:      define {{.*}}test{{.*}}
+// CHECK-NEXT: entry:
+// CHECK:      {{.*}} = alloca %struct.TrivialStruct, align 1
+// CHECK:      {{.*}} = alloca %struct.TrivialStruct, align 1
+// CHECK:      call void {{.*}}explicit_object_function{{.*}}
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK:      define {{.*}}explicit_object_function{{.*}}
+// CHECK-NEXT: entry:
+// CHECK:        {{.*}} = alloca %struct.TrivialStruct, align 1
+// CHECK:        ret void
+// CHECK-NEXT: }
+
+
+void test_lambda() {
+    [](this auto This) -> int {
+        return This();
+    }();
+}
+
+//CHECK: define dso_local void @{{.*}}test_lambda{{.*}}() #0 {
+//CHECK: entry:
+//CHECK:  %agg.tmp = alloca %class.anon, align 1
+//CHECK:  %ref.tmp = alloca %class.anon, align 1
+//CHECK:  %call = call noundef i32 @"_ZZ11test_lambdavENH3$_0clIS_EEiT_"()
+//CHECK:  ret void
+//CHECK: }
+
+//CHECK: define internal noundef i32 @"_ZZ11test_lambdavENH3$_0clIS_EEiT_"() #0 align 2 {
+//CHECK: entry:
+//CHECK:   %This = alloca %class.anon, align 1
+//CHECK:   %agg.tmp = alloca %class.anon, align 1
+//CHECK:   %call = call noundef i32 @"_ZZ11test_lambdavENH3$_0clIS_EEiT_"()
+//CHECK:   ret i32 %call
+//CHECK: }
+
+void test_lambda_ref() {
+    auto l = [i = 42](this auto & This, int j) -> int {
+        return This(j);
+    };
+    l(0);
+}
+
+// CHECK: define dso_local void @_Z15test_lambda_refv() #0 {
+// CHECK: entry:
+// CHECK:   %[[This_address:.]] = alloca %class.anon{{.*}}, align 4
+// CHECK:   %[[i_addr:.*]] = getelementptr inbounds %class.anon{{.*}}, ptr %[[This_address]], i32 0, i32 0
+// CHECK:   store i32 42, ptr %[[i_addr]], align 4
+// CHECK:   %call = call noundef i32 @"_ZZ15test_lambda_refvENH3$_0clIS_EEiRT_i"{{.*}}
+// CHECK:   ret void
+// CHECK: }
+
+// CHECK: define internal noundef i32 @"_ZZ15test_lambda_refvENH3$_0clIS_EEiRT_i"{{.*}}
+// CHECK: entry:
+// CHECK:  %This.addr = alloca ptr, align 8
+// CHECK:  %j.addr = alloca i32, align 4
+// CHECK:  store ptr %This, ptr %This.addr, align 8
+// CHECK:  store i32 %j, ptr %j.addr, align 4
+// CHECK:  %[[this_addr:.*]] = load ptr, ptr %This.addr, align 8
+// CHECK:  %[[j_addr:.*]] = load i32, ptr %j.addr, align 4
+// CHECK:  %call = call noundef i32 @"_ZZ15test_lambda_refvENH3$_0clIS_EEiRT_i"(ptr noundef nonnull align 4 dereferenceable(4) %[[this_addr]], i32 noundef %[[j_addr]])
+// CHECK:  ret i32 %call
+// CHECK: }
+
+
+struct TestPointer {
+    void f(this TestPointer &);
+};
+
+void test_pointer() {
+    TestPointer t;
+    using Fn = void(TestPointer&);
+    Fn* fn = &TestPointer::f;
+    fn(t);
+}
+//CHECK: define dso_local void @_Z12test_pointerv() #0 {
+//CHECK-NEXT: entry:
+//CHECK-NEXT:  %t = alloca %struct.TestPointer, align 1
+//CHECK-NEXT:  %fn = alloca ptr, align 8
+//CHECK-NEXT:  store ptr @_ZNH11TestPointer1fERS_, ptr %fn, align 8
+//CHECK:       %[[fn_ptr:.*]] = load ptr, ptr %fn, align 8
+//CHECK-NEXT:  call void %[[fn_ptr]](ptr noundef nonnull align 1 dereferenceable(1) %t)
+//CHECK-NEXT:  ret void
+//CHECK-NEXT: }
+
+
+struct MaterializedTemporary {
+  void foo(this MaterializedTemporary&&);
+  MaterializedTemporary();
+  ~MaterializedTemporary();
+};
+
+void test_temporary() {
+  MaterializedTemporary{}.foo();
+}
+
+//CHECK: define dso_local void @_Z14test_temporaryv(){{.*}}
+//CHECK-NEXT: entry:
+//CHECK:    %ref.tmp = alloca %struct.MaterializedTemporary, align 1
+//CHECK:    call void @_ZN21MaterializedTemporaryC1Ev(ptr noundef nonnull align 1 dereferenceable(1) %ref.tmp){{.*}}
+//CHECK     invoke void @_ZNH21MaterializedTemporary3fooEOS_(ptr noundef nonnull align 1 dereferenceable(1) %ref.tmp){{.*}}
diff --git a/clang/test/CodeGenCXX/cxx2b-mangle-deducing-this.cpp b/clang/test/CodeGenCXX/cxx2b-mangle-deducing-this.cpp
new file mode 100644
index 00000000000000..579e757e36fc80
--- /dev/null
+++ b/clang/test/CodeGenCXX/cxx2b-mangle-deducing-this.cpp
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -std=c++2b -emit-llvm -triple x86_64-linux -o - %s 2>/dev/null | FileCheck %s
+
+struct S {
+friend void test();
+public:
+    void a(this auto){}
+    void b(this auto&){}
+    void c(this S){}
+    void c(this S, int){}
+private:
+    void d(this auto){}
+    void e(this auto&){}
+    void f(this S){}
+    void f(this S, int){}
+protected:
+    void g(this auto){}
+    void h(this auto&){}
+    void i(this S){}
+    void i(this S, int){}
+};
+
+
+void test() {
+    S s;
+    s.a();
+    // CHECK: call void @_ZNH1S1aIS_EEvT_
+    s.b();
+    // CHECK: call void @_ZNH1S1bIS_EEvRT_
+    s.c();
+    // CHECK: call void @_ZNH1S1cES_
+    s.c(0);
+    // CHECK: call void @_ZNH1S1cES_i
+    s.d();
+    // CHECK: call void @_ZNH1S1dIS_EEvT_
+    s.e();
+    // CHECK: call void @_ZNH1S1eIS_EEvRT_
+    s.f();
+    // CHECK: call void @_ZNH1S1fES_
+    s.f(0);
+    // CHECK: call void @_ZNH1S1fES_i
+    s.g();
+    // CHECK: call void @_ZNH1S1gIS_EEvT_
+    s.h();
+    // CHECK: call void @_ZNH1S1hIS_EEvRT_
+    s.i();
+    // CHECK: call void @_ZNH1S1iES_
+    s.i(0);
+    // CHECK: call void @_ZNH1S1iES_i
+}
+
+struct StaticAndExplicit {
+  static void f(StaticAndExplicit);
+  void f(this StaticAndExplicit);
+};
+
+void test2() {
+    StaticAndExplicit s;
+
+    StaticAndExplicit::f(s);
+    // CHECK: call void @_ZN17StaticAndExplicit1fES_
+
+    s.f();
+    // CHECK: call void @_ZNH17StaticAndExplicit1fES_
+}
diff --git a/clang/test/CodeGenCXX/microsoft-abi-explicit-object-parameters.cpp b/clang/test/CodeGenCXX/microsoft-abi-explicit-object-parameters.cpp
new file mode 100644
index 00000000000000..b633f6c4ad3f2b
--- /dev/null
+++ b/clang/test/CodeGenCXX/microsoft-abi-explicit-object-parameters.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -std=c++2b -emit-llvm -triple=x86_64-pc-win32 -o - %s 2>/dev/null | FileCheck %s
+
+struct S {
+friend void test();
+public:
+    void a(this auto){}
+    void b(this auto&){}
+    void c(this S){}
+    void c(this S, int){}
+private:
+    void d(this auto){}
+    void e(this auto&){}
+    void f(this S){}
+    void f(this S, int){}
+protected:
+    void g(this auto){}
+    void h(this auto&){}
+    void i(this S){}
+    void i(this S, int){}
+};
+
+void test() {
+    S s;
+    s.a();
+    // CHECK: call void @"??$a@US@@@S@@SAX_VU0@@Z"
+    s.b();
+    // CHECK: call void @"??$b@US@@@S@@SAX_VAEAU0@@Z"
+    s.c();
+    // CHECK: call void @"?c@S@@SAX_VU1@@Z"
+    s.c(0);
+    // CHECK: call void @"?c@S@@SAX_VU1@H@Z"
+    s.d();
+    // CHECK: call void @"??$d@US@@@S@@CAX_VU0@@Z"
+    s.e();
+    // CHECK: call void @"??$e@US@@@S@@CAX_VAEAU0@@Z"
+    s.f();
+    // CHECK: call void @"?f@S@@CAX_VU1@@Z"
+    s.f(0);
+    // CHECK: call void @"?f@S@@CAX_VU1@H@Z"
+    s.g();
+    // CHECK: call void @"??$g@US@@@S@@KAX_VU0@@Z"
+    s.h();
+    // CHECK: call void @"??$h@US@@@S@@KAX_VAEAU0@@Z"
+    s.i();
+    // CHECK: call void @"?i@S@@KAX_VU1@@Z"
+    s.i(0);
+    // CHECK: call void @"?i@S@@KAX_VU1@H@Z"
+}
+
+
+struct S2 {
+  int i = 0;
+  void foo(this const S2&, int);
+};
+struct T {
+  S2 bar(this const T&, int);
+};
+void chain_test() {
+  T t;
+  t.bar(0).foo(0);
+}
+// CHECK: define {{.*}}chain_test{{.*}}
+// CHECK-NEXT: entry:
+// CHECK: {{.*}} = alloca %struct.T, align 1
+// CHECK: {{.*}} = alloca %struct.S2, align 4
+// CHECK: %call = call i32 @"?bar@T@@SA?AUS2@@_VAEBU1@H@Z"{{.*}}
+// CHECK: %coerce.dive = getelementptr inbounds %struct.S2, {{.*}} %{{.*}}, i32 0, i32 0
+// CHECK  store i32 %call, ptr %coerce.dive, align 4
+// CHECK: call void @"?foo@S2@@SAX_VAEBU1@H@Z"
+// CHECK: ret void
diff --git a/clang/test/CodeGenCXX/strict-vtable-pointers-GH67937.cpp b/clang/test/CodeGenCXX/strict-vtable-pointers-GH67937.cpp
new file mode 100644
index 00000000000000..692ca23a69abe6
--- /dev/null
+++ b/clang/test/CodeGenCXX/strict-vtable-pointers-GH67937.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -I%S -triple=x86_64-pc-windows-msvc -fstrict-vtable-pointers -disable-llvm-passes -O1 -emit-llvm -o %t.ll
+// RUN: FileCheck %s < %t.ll
+
+struct A {
+  virtual ~A();
+};
+struct B : virtual A {};
+class C : B {};
+C foo;
+
+// CHECK-LABEL: define {{.*}} @"??0C@@QEAA@XZ"(ptr {{.*}} %this, i32 {{.*}} %is_most_derived)
+// CHECK: ctor.init_vbases:
+// CHECK-NEXT: %0 = getelementptr inbounds i8, ptr %this1, i64 0
+// CHECK-NEXT: store ptr @"??_8C@@7B@", ptr %0
+// CHECK-NEXT: %1 = call ptr @llvm.launder.invariant.group.p0(ptr %this1)
+// CHECK-NEXT: %2 = getelementptr inbounds i8, ptr %1, i64 8
+// CHECK-NEXT: %call = call noundef ptr @"??0A@@QEAA@XZ"(ptr {{.*}} %2) #2
+// CHECK-NEXT: br label %ctor.skip_vbases
+// CHECK-EMPTY:
+// CHECK-NEXT: ctor.skip_vbases:
+// CHECK-NEXT: %3 = call ptr @llvm.launder.invariant.group.p0(ptr %this1)
+// CHECK-NEXT: %call3 = call noundef ptr @"??0B@@QEAA@XZ"(ptr {{.*}} %3, i32 noundef 0) #2
diff --git a/clang/test/CodeGenCXX/weak-external.cpp b/clang/test/CodeGenCXX/weak-external.cpp
index af636ccb4a3aef..5fc37f73bb4697 100644
--- a/clang/test/CodeGenCXX/weak-external.cpp
+++ b/clang/test/CodeGenCXX/weak-external.cpp
@@ -80,19 +80,23 @@ namespace not_weak_on_first {
 namespace constant_eval {
   [[gnu::weak]] extern int a;
   // CHECK-LABEL: define {{.*}} @__cxx_global_var_init
-  // CHECK:     store i8 zext (i1 icmp ne (ptr @_ZN13constant_eval1aE, ptr null) to i8), ptr @_ZN13constant_eval6has_a1E,
+  // CHECK:     [[ZEXT:%.*]] = zext i1 icmp ne (ptr @_ZN13constant_eval1aE, ptr null) to i8
+  // CHECK:     store i8 [[ZEXT]], ptr @_ZN13constant_eval6has_a1E,
   bool has_a1 = &a;
   // CHECK-LABEL: define {{.*}} @__cxx_global_var_init
-  // CHECK:     store i8 zext (i1 icmp ne (ptr @_ZN13constant_eval1aE, ptr null) to i8), ptr @_ZN13constant_eval6has_a2E,
+  // CHECK:     [[ZEXT:%.*]] = zext i1 icmp ne (ptr @_ZN13constant_eval1aE, ptr null) to i8
+  // CHECK:     store i8 [[ZEXT]], ptr @_ZN13constant_eval6has_a2E,
   bool has_a2 = &a != nullptr;
 
   struct X {
     [[gnu::weak]] void f();
   };
   // CHECK-LABEL: define {{.*}} @__cxx_global_var_init
-  // CHECK:     store i8 zext (i1 icmp ne (i{{32|64}} ptrtoint (ptr @_ZN13constant_eval1X1fEv to i{{32|64}}), i{{32|64}} 0) to i8), ptr @_ZN13constant_eval6has_f1E,
+  // CHECK:     [[ZEXT:%.*]] = zext i1 icmp ne (i{{32|64}} ptrtoint (ptr @_ZN13constant_eval1X1fEv to i{{32|64}}), i{{32|64}} 0) to i8
+  // CHECK:     store i8 [[ZEXT]], ptr @_ZN13constant_eval6has_f1E,
   bool has_f1 = &X::f;
   // CHECK-LABEL: define {{.*}} @__cxx_global_var_init
-  // CHECK:     store i8 zext (i1 icmp ne (i{{32|64}} ptrtoint (ptr @_ZN13constant_eval1X1fEv to i{{32|64}}), i{{32|64}} 0) to i8), ptr @_ZN13constant_eval6has_f2E,
+  // CHECK:     [[ZEXT:%.*]] = zext i1 icmp ne (i{{32|64}} ptrtoint (ptr @_ZN13constant_eval1X1fEv to i{{32|64}}), i{{32|64}} 0) to i8
+  // CHECK:     store i8 [[ZEXT]], ptr @_ZN13constant_eval6has_f2E,
   bool has_f2 = &X::f != nullptr;
 }
diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
index cb3ca514c584fb..8a04456b5df044 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -606,7 +606,8 @@ int test_and_ptr(private char* p1, local char* p2) {
 // NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob, align 8
 // NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
 // NOOPT: call void @test_fold_callee
-// NOOPT: %{{.*}} = add nsw i64 %1, sext (i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64)
+// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
+// NOOPT: %{{.*}} = add nsw i64 %1, %[[SEXT]]
 // NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
 void test_fold_callee(void);
 void test_fold_private(void) {
@@ -621,7 +622,8 @@ void test_fold_private(void) {
 // NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob, align 8
 // NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
 // NOOPT: call void @test_fold_callee
-// NOOPT: %{{.*}} = add nsw i64 %{{.*}}, sext (i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64)
+// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
+// NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
 // NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
 void test_fold_local(void) {
   global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
diff --git a/clang/test/Driver/Inputs/hipstdpar/hipstdpar_lib.hpp b/clang/test/Driver/Inputs/hipstdpar/hipstdpar_lib.hpp
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/clang/test/Driver/Inputs/hipstdpar/rocprim/.keep b/clang/test/Driver/Inputs/hipstdpar/rocprim/.keep
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/clang/test/Driver/Inputs/hipstdpar/rocprim/rocprim/.keep b/clang/test/Driver/Inputs/hipstdpar/rocprim/rocprim/.keep
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/clang/test/Driver/Inputs/hipstdpar/thrust/.keep b/clang/test/Driver/Inputs/hipstdpar/thrust/.keep
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/clang/test/Driver/Inputs/hipstdpar/thrust/thrust/.keep b/clang/test/Driver/Inputs/hipstdpar/thrust/thrust/.keep
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/clang/test/Driver/clang-offload-bundler-zlib.c b/clang/test/Driver/clang-offload-bundler-zlib.c
new file mode 100644
index 00000000000000..c46c32a4a0537b
--- /dev/null
+++ b/clang/test/Driver/clang-offload-bundler-zlib.c
@@ -0,0 +1,75 @@
+// REQUIRES: zlib
+// REQUIRES: x86-registered-target
+// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+
+//
+// Generate the host binary to be bundled.
+//
+// RUN: %clang -O0 -target %itanium_abi_triple %s -c -emit-llvm -o %t.bc
+
+//
+// Generate an empty file to help with the checks of empty files.
+//
+// RUN: touch %t.empty
+
+//
+// Generate device binaries to be bundled.
+//
+// RUN: echo 'Content of device file 1' > %t.tgt1
+// RUN: echo 'Content of device file 2' > %t.tgt2
+
+//
+// Check compression/decompression of offload bundle.
+//
+// RUN: env OFFLOAD_BUNDLER_COMPRESS=1 OFFLOAD_BUNDLER_VERBOSE=1 \
+// RUN:   clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc 2>&1 | \
+// RUN:   FileCheck -check-prefix=COMPRESS %s
+// RUN: clang-offload-bundler -type=bc -list -input=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s
+// RUN: env OFFLOAD_BUNDLER_VERBOSE=1 \
+// RUN:   clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle 2>&1 | \
+// RUN:   FileCheck -check-prefix=DECOMPRESS %s
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+
+//
+// COMPRESS: Compression method used:
+// DECOMPRESS: Decompression method:
+// NOHOST-NOT: host-
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx900
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx906
+//
+
+//
+// Check -bundle-align option.
+//
+
+// RUN: clang-offload-bundler -bundle-align=4096 -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.bc -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle3.bc -compress
+// RUN: clang-offload-bundler -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.bc -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.bundle3.bc -unbundle
+// RUN: diff %t.bc %t.res.bc
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+
+//
+// Check unbundling archive.
+//
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -input=%t.tgt1 -input=%t.tgt2 -output=%T/hip_bundle1.bc -compress
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -input=%t.tgt1 -input=%t.tgt2 -output=%T/hip_bundle2.bc -compress
+// RUN: llvm-ar cr %T/hip_archive.a %T/hip_bundle1.bc %T/hip_bundle2.bc
+// RUN: clang-offload-bundler -unbundle -type=a -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -output=%T/hip_900.a -output=%T/hip_906.a -input=%T/hip_archive.a
+// RUN: llvm-ar t %T/hip_900.a | FileCheck -check-prefix=HIP-AR-900 %s
+// RUN: llvm-ar t %T/hip_906.a | FileCheck -check-prefix=HIP-AR-906 %s
+// HIP-AR-900-DAG: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx900
+// HIP-AR-900-DAG: hip_bundle2-hip-amdgcn-amd-amdhsa--gfx900
+// HIP-AR-906-DAG: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx906
+// HIP-AR-906-DAG: hip_bundle2-hip-amdgcn-amd-amdhsa--gfx906
+
+// Some code so that we can create a binary out of this file.
+int A = 0;
+void test_func(void) {
+  ++A;
+}
diff --git a/clang/test/Driver/clang-offload-bundler-zstd.c b/clang/test/Driver/clang-offload-bundler-zstd.c
new file mode 100644
index 00000000000000..b2b588b72d4d6f
--- /dev/null
+++ b/clang/test/Driver/clang-offload-bundler-zstd.c
@@ -0,0 +1,72 @@
+// REQUIRES: zstd
+// REQUIRES: x86-registered-target
+// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+
+//
+// Generate the host binary to be bundled.
+//
+// RUN: %clang -O0 -target %itanium_abi_triple %s -c -emit-llvm -o %t.bc
+
+//
+// Generate an empty file to help with the checks of empty files.
+//
+// RUN: touch %t.empty
+
+//
+// Generate device binaries to be bundled.
+//
+// RUN: echo 'Content of device file 1' > %t.tgt1
+// RUN: echo 'Content of device file 2' > %t.tgt2
+
+//
+// Check compression/decompression of offload bundle.
+//
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc -compress -verbose 2>&1 | \
+// RUN:   FileCheck -check-prefix=COMPRESS %s
+// RUN: clang-offload-bundler -type=bc -list -input=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle -verbose 2>&1 | \
+// RUN:   FileCheck -check-prefix=DECOMPRESS %s
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+//
+// COMPRESS: Compression method used
+// DECOMPRESS: Decompression method
+// NOHOST-NOT: host-
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx900
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx906
+//
+
+//
+// Check -bundle-align option.
+//
+
+// RUN: clang-offload-bundler -bundle-align=4096 -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.bc -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle3.bc -compress
+// RUN: clang-offload-bundler -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.bc -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.bundle3.bc -unbundle
+// RUN: diff %t.bc %t.res.bc
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+
+//
+// Check unbundling archive.
+//
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -input=%t.tgt1 -input=%t.tgt2 -output=%T/hip_bundle1.bc -compress
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -input=%t.tgt1 -input=%t.tgt2 -output=%T/hip_bundle2.bc -compress
+// RUN: llvm-ar cr %T/hip_archive.a %T/hip_bundle1.bc %T/hip_bundle2.bc
+// RUN: clang-offload-bundler -unbundle -type=a -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \
+// RUN:   -output=%T/hip_900.a -output=%T/hip_906.a -input=%T/hip_archive.a
+// RUN: llvm-ar t %T/hip_900.a | FileCheck -check-prefix=HIP-AR-900 %s
+// RUN: llvm-ar t %T/hip_906.a | FileCheck -check-prefix=HIP-AR-906 %s
+// HIP-AR-900-DAG: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx900
+// HIP-AR-900-DAG: hip_bundle2-hip-amdgcn-amd-amdhsa--gfx900
+// HIP-AR-906-DAG: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx906
+// HIP-AR-906-DAG: hip_bundle2-hip-amdgcn-amd-amdhsa--gfx906
+
+// Some code so that we can create a binary out of this file.
+int A = 0;
+void test_func(void) {
+  ++A;
+}
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index 7a3616a2e9f0a4..ebe8a0520bf0fc 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -520,6 +520,11 @@
 // CHECK-COVERAGE-COMPILATION-DIR: "-fcoverage-compilation-dir=."
 // CHECK-COVERAGE-COMPILATION-DIR-NOT: "-ffile-compilation-dir=."
 
+// RUN: %clang -### -S -fverify-intermediate-code %s 2>&1 | FileCheck -check-prefix=CHECK-VERIFY-INTERMEDIATE-CODE %s
+// RUN: %clang -### -S -fno-verify-intermediate-code %s 2>&1 | FileCheck -check-prefix=CHECK-NO-VERIFY-INTERMEDIATE-CODE %s
+// CHECK-VERIFY-INTERMEDIATE-CODE-NOT: "-disable-llvm-verifier"
+// CHECK-NO-VERIFY-INTERMEDIATE-CODE: "-disable-llvm-verifier"
+
 // RUN: %clang -### -S -fdiscard-value-names %s 2>&1 | FileCheck -check-prefix=CHECK-DISCARD-NAMES %s
 // RUN: %clang -### -S -fno-discard-value-names %s 2>&1 | FileCheck -check-prefix=CHECK-NO-DISCARD-NAMES %s
 // CHECK-DISCARD-NAMES: "-discard-value-names"
diff --git a/clang/test/Driver/hip-offload-compress-zlib.hip b/clang/test/Driver/hip-offload-compress-zlib.hip
new file mode 100644
index 00000000000000..a29b6d037350d4
--- /dev/null
+++ b/clang/test/Driver/hip-offload-compress-zlib.hip
@@ -0,0 +1,45 @@
+// REQUIRES: zlib
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// Test compress bundled bitcode.
+
+// RUN: rm -rf %T/a.bc
+// RUN: %clang -c -v --target=x86_64-linux-gnu \
+// RUN:   -x hip --offload-arch=gfx1100 --offload-arch=gfx1101 \
+// RUN:   -fgpu-rdc -nogpuinc -nogpulib \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   --offload-compress --offload-device-only --gpu-bundle-output \
+// RUN:   -o %T/a.bc \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: clang-offload-bundler{{.*}} -type=bc
+// CHECK-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101
+// CHECK-SAME: -compress -verbose
+// CHECK: Compressed bundle format
+
+// Test uncompress of bundled bitcode.
+
+// RUN: %clang --hip-link -### -v --target=x86_64-linux-gnu \
+// RUN:   --offload-arch=gfx1100 --offload-arch=gfx1101 \
+// RUN:   -fgpu-rdc -nogpulib \
+// RUN:   %T/a.bc --offload-device-only \
+// RUN: 2>&1 | FileCheck -check-prefix=UNBUNDLE %s
+
+// UNBUNDLE: clang-offload-bundler{{.*}} "-type=bc"
+// UNBUNDLE-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101
+// UNBUNDLE-SAME: -unbundle
+// UNBUNDLE-SAME: -verbose
+
+// Test compress bundled code objects.
+
+// RUN: %clang -c -### -v --target=x86_64-linux-gnu \
+// RUN:   -x hip --offload-arch=gfx1100 --offload-arch=gfx1101 \
+// RUN:   -nogpuinc -nogpulib \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   --offload-compress \
+// RUN: 2>&1 | FileCheck -check-prefix=CO %s
+
+// CO: clang-offload-bundler{{.*}} "-type=o"
+// CO-SAME: -targets={{.*}}hipv4-amdgcn-amd-amdhsa--gfx1100,hipv4-amdgcn-amd-amdhsa--gfx1101
+// CO-SAME: "-compress" "-verbose"
diff --git a/clang/test/Driver/hip-offload-compress-zstd.hip b/clang/test/Driver/hip-offload-compress-zstd.hip
new file mode 100644
index 00000000000000..688c2c85329c19
--- /dev/null
+++ b/clang/test/Driver/hip-offload-compress-zstd.hip
@@ -0,0 +1,45 @@
+// REQUIRES: zstd
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// Test compress bundled bitcode.
+
+// RUN: rm -rf %T/a.bc
+// RUN: %clang -c -v --target=x86_64-linux-gnu \
+// RUN:   -x hip --offload-arch=gfx1100 --offload-arch=gfx1101 \
+// RUN:   -fgpu-rdc -nogpuinc -nogpulib \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   --offload-compress --offload-device-only --gpu-bundle-output \
+// RUN:   -o %T/a.bc \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: clang-offload-bundler{{.*}} -type=bc
+// CHECK-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101
+// CHECK-SAME: -compress -verbose
+// CHECK: Compressed bundle format
+
+// Test uncompress of bundled bitcode.
+
+// RUN: %clang --hip-link -### -v --target=x86_64-linux-gnu \
+// RUN:   --offload-arch=gfx1100 --offload-arch=gfx1101 \
+// RUN:   -fgpu-rdc -nogpulib \
+// RUN:   %T/a.bc --offload-device-only \
+// RUN: 2>&1 | FileCheck -check-prefix=UNBUNDLE %s
+
+// UNBUNDLE: clang-offload-bundler{{.*}} "-type=bc"
+// UNBUNDLE-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101
+// UNBUNDLE-SAME: -unbundle
+// UNBUNDLE-SAME: -verbose
+
+// Test compress bundled code objects.
+
+// RUN: %clang -c -### -v --target=x86_64-linux-gnu \
+// RUN:   -x hip --offload-arch=gfx1100 --offload-arch=gfx1101 \
+// RUN:   -nogpuinc -nogpulib \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   --offload-compress \
+// RUN: 2>&1 | FileCheck -check-prefix=CO %s
+
+// CO: clang-offload-bundler{{.*}} "-type=o"
+// CO-SAME: -targets={{.*}}hipv4-amdgcn-amd-amdhsa--gfx1100,hipv4-amdgcn-amd-amdhsa--gfx1101
+// CO-SAME: "-compress" "-verbose"
diff --git a/clang/test/Driver/hipstdpar.c b/clang/test/Driver/hipstdpar.c
new file mode 100644
index 00000000000000..69c5b177d170cd
--- /dev/null
+++ b/clang/test/Driver/hipstdpar.c
@@ -0,0 +1,25 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: system-linux
+// XFAIL: target={{.*}}hexagon{{.*}}
+// XFAIL: target={{.*}}-scei{{.*}}
+// XFAIL: target={{.*}}-sie{{.*}}
+
+// RUN: not %clang -### --hipstdpar -nogpulib -nogpuinc --compile %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=HIPSTDPAR-MISSING-LIB %s
+// RUN: %clang -### --hipstdpar --hipstdpar-path=%S/Inputs/hipstdpar \
+// RUN:   --hipstdpar-thrust-path=%S/Inputs/hipstdpar/thrust \
+// RUN:   --hipstdpar-prim-path=%S/Inputs/hipstdpar/rocprim \
+// RUN:   -nogpulib -nogpuinc --compile %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=HIPSTDPAR-COMPILE %s
+// RUN: touch %t.o
+// RUN: %clang -### --hipstdpar %t.o 2>&1 | FileCheck --check-prefix=HIPSTDPAR-LINK %s
+
+// HIPSTDPAR-MISSING-LIB: error: cannot find HIP Standard Parallelism Acceleration library; provide it via '--hipstdpar-path'
+// HIPSTDPAR-COMPILE: "-x" "hip"
+// HIPSTDPAR-COMPILE: "-idirafter" "{{.*/thrust}}"
+// HIPSTDPAR-COMPILE: "-idirafter" "{{.*/rocprim}}"
+// HIPSTDPAR-COMPILE: "-idirafter" "{{.*/Inputs/hipstdpar}}"
+// HIPSTDPAR-COMPILE: "-include" "hipstdpar_lib.hpp"
+// HIPSTDPAR-LINK: "-rpath"
+// HIPSTDPAR-LINK: "-l{{.*hip.*}}"
diff --git a/clang/test/Driver/lto-jobs.c b/clang/test/Driver/lto-jobs.c
index 73d7a94dd289cf..43a478b0664d84 100644
--- a/clang/test/Driver/lto-jobs.c
+++ b/clang/test/Driver/lto-jobs.c
@@ -17,3 +17,8 @@
 // RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS2-ACTION < %t %s
 //
 // CHECK-LINK-THIN-JOBS2-ACTION: "-mllvm" "-threads={{[0-9]+}}"
+
+// RUN: %clang --target=powerpc-ibm-aix -### %s -flto=thin -flto-jobs=5 2> %t
+// RUN: FileCheck -check-prefix=CHECK-AIX-LINK-THIN-JOBS-ACTION < %t %s
+//
+// CHECK-AIX-LINK-THIN-JOBS-ACTION: "-bplugin_opt:-threads=5"
diff --git a/clang/test/Driver/mingw-linker-options.c b/clang/test/Driver/mingw-linker-options.c
new file mode 100644
index 00000000000000..b4f1d05a719f3a
--- /dev/null
+++ b/clang/test/Driver/mingw-linker-options.c
@@ -0,0 +1,10 @@
+// RUN: %clang --target=x86_64-windows-gnu -c -mwindows %s -### 2>&1 | FileCheck %s --check-prefix=WARNING
+// RUN: %clang --target=x86_64-windows-gnu -c -mconsole %s -### 2>&1 | FileCheck %s --check-prefix=WARNING
+// RUN: %clang --target=x86_64-windows-gnu -c -mdll %s -### 2>&1 | FileCheck %s --check-prefix=WARNING
+// RUN: %clang --target=x86_64-windows-gnu -c -mthreads %s -### 2>&1 | FileCheck %s --check-prefix=WARNING
+// RUN: not %clang --target=x86_64-windows-msvc -c -mwindows %s -### 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN: not %clang --target=x86_64-windows-msvc -c -mconsole %s -### 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN: not %clang --target=x86_64-windows-msvc -c -mdll %s -### 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN: not %clang --target=x86_64-windows-msvc -c -mthreads %s -### 2>&1 | FileCheck %s --check-prefix=ERROR
+// WARNING: warning: argument unused during compilation: '{{.*}}' [-Wunused-command-line-argument]
+// ERROR: error: unsupported option '{{.*}}' for target '{{.*}}'
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index 92a6a59cba2317..0ac81ea982f1b6 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -372,24 +372,24 @@
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ZFHMIN %s
 // RV32-ZFHMIN: "-target-feature" "+zfhmin"
 
-// RUN: not %clang --target=riscv32-unknown-elf -march=rv32izfa -### %s \
+// RUN: not %clang --target=riscv32-unknown-elf -march=rv32iztso -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-EXPERIMENTAL-NOFLAG %s
-// RV32-EXPERIMENTAL-NOFLAG: error: invalid arch name 'rv32izfa'
+// RV32-EXPERIMENTAL-NOFLAG: error: invalid arch name 'rv32iztso'
 // RV32-EXPERIMENTAL-NOFLAG: requires '-menable-experimental-extensions'
 
-// RUN: not %clang --target=riscv32-unknown-elf -march=rv32izfa -menable-experimental-extensions -### %s \
+// RUN: not %clang --target=riscv32-unknown-elf -march=rv32iztso -menable-experimental-extensions -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-EXPERIMENTAL-NOVERS %s
-// RV32-EXPERIMENTAL-NOVERS: error: invalid arch name 'rv32izfa'
+// RV32-EXPERIMENTAL-NOVERS: error: invalid arch name 'rv32iztso'
 // RV32-EXPERIMENTAL-NOVERS: experimental extension requires explicit version number
 
-// RUN: not %clang --target=riscv32-unknown-elf -march=rv32izfa0p1 -menable-experimental-extensions -### %s \
+// RUN: not %clang --target=riscv32-unknown-elf -march=rv32iztso0p7 -menable-experimental-extensions -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-EXPERIMENTAL-BADVERS %s
-// RV32-EXPERIMENTAL-BADVERS: error: invalid arch name 'rv32izfa0p1'
-// RV32-EXPERIMENTAL-BADVERS: unsupported version number 0.1 for experimental extension 'zfa' (this compiler supports 0.2)
+// RV32-EXPERIMENTAL-BADVERS: error: invalid arch name 'rv32iztso0p7'
+// RV32-EXPERIMENTAL-BADVERS: unsupported version number 0.7 for experimental extension 'ztso' (this compiler supports 0.1)
 
-// RUN: %clang --target=riscv32-unknown-elf -march=rv32izfa0p2 -menable-experimental-extensions -### %s \
+// RUN: %clang --target=riscv32-unknown-elf -march=rv32iztso0p1 -menable-experimental-extensions -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-EXPERIMENTAL-GOODVERS %s
-// RV32-EXPERIMENTAL-GOODVERS: "-target-feature" "+experimental-zfa"
+// RV32-EXPERIMENTAL-GOODVERS: "-target-feature" "+experimental-ztso"
 
 // RUN: %clang --target=riscv32-unknown-elf -march=rv32izbb1p0 -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ZBB %s
diff --git a/clang/test/FixIt/fixit-deducing-this.cpp b/clang/test/FixIt/fixit-deducing-this.cpp
new file mode 100644
index 00000000000000..c073fd38ffc2c5
--- /dev/null
+++ b/clang/test/FixIt/fixit-deducing-this.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -verify -std=c++2c %s
+// RUN: cp %s %t
+// RUN: not %clang_cc1 -x c++ -std=c++2c -fixit %t
+// RUN: %clang_cc1 -x c++ -std=c++2c %t
+// RUN: not %clang_cc1 -std=c++2c -x c++ -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+struct S {
+    void f(this S);
+    void g() {
+        (void)&f;  // expected-error {{must explicitly qualify name of member function when taking its address}}
+// CHECK: fix-it:{{.*}}:{9:16-9:16}:"S::"
+    }
+};
+
+struct S2 {
+    void f(this S2 foo) {
+        g(); // expected-error {{call to non-static member function without an object argument}}
+// CHECK: fix-it:{{.*}}:{16:9-16:9}:"foo."
+
+        h(); // expected-error {{call to explicit member function without an object argument}}
+// CHECK: fix-it:{{.*}}:{19:9-19:9}:"foo."
+
+        i();
+
+        var; // expected-error {{invalid use of member 'var' in explicit object member function}}
+// CHECK: fix-it:{{.*}}:{24:9-24:9}:"foo."
+
+    }
+    void g();
+    void h(this S2 s);
+    static void i();
+    int var;
+};
diff --git a/clang/test/FixIt/format-darwin-enum-class.cpp b/clang/test/FixIt/format-darwin-enum-class.cpp
new file mode 100644
index 00000000000000..5aa1a80d8614c2
--- /dev/null
+++ b/clang/test/FixIt/format-darwin-enum-class.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -verify -Wformat %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -fdiagnostics-parseable-fixits -Wformat %s 2>&1 | FileCheck %s
+
+extern "C" int printf(const char * restrict, ...);
+
+#if __LP64__
+typedef long CFIndex;
+typedef long NSInteger;
+typedef unsigned long NSUInteger;
+#else
+typedef int CFIndex;
+typedef int NSInteger;
+typedef unsigned int NSUInteger;
+#endif
+
+enum class CFIndexEnum : CFIndex { One };
+enum class NSIntegerEnum : NSInteger { Two };
+enum class NSUIntegerEnum : NSUInteger { Three };
+
+void f() {
+  printf("%d", CFIndexEnum::One); // expected-warning{{format specifies type 'int' but the argument has type 'CFIndexEnum'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:13}:"%ld"
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:"static_cast<long>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:32-[[@LINE-3]]:32}:")"
+
+  printf("%d", NSIntegerEnum::Two); // expected-warning{{format specifies type 'int' but the argument has type 'NSIntegerEnum'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:13}:"%ld"
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:"static_cast<long>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:34-[[@LINE-3]]:34}:")"
+
+  printf("%d", NSUIntegerEnum::Three); // expected-warning{{format specifies type 'int' but the argument has type 'NSUIntegerEnum'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:13}:"%lu"
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:"static_cast<unsigned long>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:37-[[@LINE-3]]:37}:")"
+}
diff --git a/clang/test/FixIt/format.cpp b/clang/test/FixIt/format.cpp
index 5016ee587ed1c4..4e6573a4f9e54e 100644
--- a/clang/test/FixIt/format.cpp
+++ b/clang/test/FixIt/format.cpp
@@ -12,21 +12,33 @@ struct S {
   N::E Type;
 };
 
+using uint32_t = unsigned;
+enum class FixedE : uint32_t { Two };
+
 void a(N::E NEVal, S *SPtr, S &SRef) {
   printf("%d", N::E::One); // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
   // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:16-[[@LINE-1]]:16}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:25-[[@LINE-2]]:25}:")"
 
   printf("%hd", N::E::One); // expected-warning{{format specifies type 'short' but the argument has type 'N::E'}}
-  // CHECK: "static_cast<short>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%d"
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:17}:"static_cast<int>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:26-[[@LINE-3]]:26}:")"
 
   printf("%hu", N::E::One); // expected-warning{{format specifies type 'unsigned short' but the argument has type 'N::E'}}
-  // CHECK: "static_cast<unsigned short>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%d"
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:17-[[@LINE-2]]:17}:"static_cast<int>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:26-[[@LINE-3]]:26}:")"
 
   LOG("%d", N::E::One); // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
   // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:13-[[@LINE-1]]:13}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:22-[[@LINE-2]]:22}:")"
 
+  LOG("%s", N::E::One); // expected-warning{{format specifies type 'char *' but the argument has type 'N::E'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:8-[[@LINE-1]]:10}:"%d"
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:13-[[@LINE-2]]:13}:"static_cast<int>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:22-[[@LINE-3]]:22}:")"
+
   printf("%d", NEVal); // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
   // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:16-[[@LINE-1]]:16}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:21-[[@LINE-2]]:21}:")"
@@ -58,4 +70,8 @@ void a(N::E NEVal, S *SPtr, S &SRef) {
       SRef.Type);
   // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:7}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:")"
+
+  printf("%u", FixedE::Two); //expected-warning{{format specifies type 'unsigned int' but the argument has type 'FixedE'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:16-[[@LINE-1]]:16}:"static_cast<uint32_t>("
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:27-[[@LINE-2]]:27}:")"
 }
diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip
index f66f5f3c3ee440..c194f4437890d0 100644
--- a/clang/test/Headers/__clang_hip_cmath.hip
+++ b/clang/test/Headers/__clang_hip_cmath.hip
@@ -18,12 +18,12 @@
 
 // DEFAULT-LABEL: @test_fma_f16(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract half @llvm.fma.f16(half [[X:%.*]], half [[Y:%.*]], half [[Z:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef half @llvm.fma.f16(half [[X:%.*]], half [[Y:%.*]], half [[Z:%.*]])
 // DEFAULT-NEXT:    ret half [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fma_f16(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract half @llvm.fma.f16(half [[X:%.*]], half [[Y:%.*]], half [[Z:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef half @llvm.fma.f16(half [[X:%.*]], half [[Y:%.*]], half [[Z:%.*]])
 // FINITEONLY-NEXT:    ret half [[TMP0]]
 //
 extern "C" __device__ _Float16 test_fma_f16(_Float16 x, _Float16 y,
@@ -33,12 +33,12 @@ extern "C" __device__ _Float16 test_fma_f16(_Float16 x, _Float16 y,
 
 // DEFAULT-LABEL: @test_pow_f16(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract half @__ocml_pown_f16(half noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef half @__ocml_pown_f16(half noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]]
 // DEFAULT-NEXT:    ret half [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_pow_f16(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) half @__ocml_pown_f16(half noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) half @__ocml_pown_f16(half noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]]
 // FINITEONLY-NEXT:    ret half [[CALL_I]]
 //
 extern "C" __device__ _Float16 test_pow_f16(_Float16 x, int y) {
@@ -47,12 +47,12 @@ extern "C" __device__ _Float16 test_pow_f16(_Float16 x, int y) {
 
 // DEFAULT-LABEL: @test_fabs_f32(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fabs.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fabs_f32(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.fabs.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_fabs_f32(float x) {
@@ -61,12 +61,12 @@ extern "C" __device__ float test_fabs_f32(float x) {
 
 // DEFAULT-LABEL: @test_sin_f32(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]]
 // DEFAULT-NEXT:    ret float [[CALL_I_I]]
 //
 // FINITEONLY-LABEL: @test_sin_f32(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]]
 // FINITEONLY-NEXT:    ret float [[CALL_I_I]]
 //
 extern "C" __device__ float test_sin_f32(float x) {
@@ -75,12 +75,12 @@ extern "C" __device__ float test_sin_f32(float x) {
 
 // DEFAULT-LABEL: @test_cos_f32(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]]
 // DEFAULT-NEXT:    ret float [[CALL_I_I]]
 //
 // FINITEONLY-LABEL: @test_cos_f32(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]]
 // FINITEONLY-NEXT:    ret float [[CALL_I_I]]
 //
 extern "C" __device__ float test_cos_f32(float x) {
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index aa69bac6b624d6..15eccc3b2baba0 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -231,8 +231,8 @@ extern "C" __device__ uint64_t test___make_mantissa(const char *p) {
 
 // CHECK-LABEL: @test_abs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ABS_I:%.*]] = tail call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
-// CHECK-NEXT:    ret i32 [[ABS_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+// CHECK-NEXT:    ret i32 [[TMP0]]
 //
 extern "C" __device__ int test_abs(int x) {
   return abs(x);
@@ -240,8 +240,8 @@ extern "C" __device__ int test_abs(int x) {
 
 // CHECK-LABEL: @test_labs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ABS_I:%.*]] = tail call i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
-// CHECK-NEXT:    ret i64 [[ABS_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
+// CHECK-NEXT:    ret i64 [[TMP0]]
 //
 extern "C" __device__ long test_labs(long x) {
   return labs(x);
@@ -249,8 +249,8 @@ extern "C" __device__ long test_labs(long x) {
 
 // CHECK-LABEL: @test_llabs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ABS_I:%.*]] = tail call i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
-// CHECK-NEXT:    ret i64 [[ABS_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
+// CHECK-NEXT:    ret i64 [[TMP0]]
 //
 extern "C" __device__ long long test_llabs(long x) {
   return llabs(x);
@@ -258,17 +258,17 @@ extern "C" __device__ long long test_llabs(long x) {
 
 // DEFAULT-LABEL: @test_acosf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_acosf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14:[0-9]+]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_acosf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_acosf(float x) {
@@ -277,17 +277,17 @@ extern "C" __device__ float test_acosf(float x) {
 
 // DEFAULT-LABEL: @test_acos(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_acos(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_acos(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_acos(double x) {
@@ -296,17 +296,17 @@ extern "C" __device__ double test_acos(double x) {
 
 // DEFAULT-LABEL: @test_acoshf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_acoshf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15:[0-9]+]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_acosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15:[0-9]+]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_acoshf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR15:[0-9]+]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_acoshf(float x) {
@@ -315,17 +315,17 @@ extern "C" __device__ float test_acoshf(float x) {
 
 // DEFAULT-LABEL: @test_acosh(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_acosh(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_acosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_acosh(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_acosh(double x) {
@@ -334,17 +334,17 @@ extern "C" __device__ double test_acosh(double x) {
 
 // DEFAULT-LABEL: @test_asinf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_asinf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_asinf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_asinf(float x) {
@@ -353,17 +353,17 @@ extern "C" __device__ float test_asinf(float x) {
 
 // DEFAULT-LABEL: @test_asin(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_asin(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_asin(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_asin(double x) {
@@ -373,17 +373,17 @@ extern "C" __device__ double test_asin(double x) {
 
 // DEFAULT-LABEL: @test_asinhf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_asinhf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_asinh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_asinhf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_asinhf(float x) {
@@ -392,17 +392,17 @@ extern "C" __device__ float test_asinhf(float x) {
 
 // DEFAULT-LABEL: @test_asinh(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_asinh(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_asinh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_asinh(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_asinh(double x) {
@@ -411,17 +411,17 @@ extern "C" __device__ double test_asinh(double x) {
 
 // DEFAULT-LABEL: @test_atan2f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_atan2f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan2_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_atan2f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_atan2f(float x, float y) {
@@ -430,17 +430,17 @@ extern "C" __device__ float test_atan2f(float x, float y) {
 
 // DEFAULT-LABEL: @test_atan2(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_atan2(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan2_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_atan2(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_atan2(double x, double y) {
@@ -449,17 +449,17 @@ extern "C" __device__ double test_atan2(double x, double y) {
 
 // DEFAULT-LABEL: @test_atanf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_atanf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_atanf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_atanf(float x) {
@@ -468,17 +468,17 @@ extern "C" __device__ float test_atanf(float x) {
 
 // DEFAULT-LABEL: @test_atan(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_atan(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_atan(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_atan(double x) {
@@ -487,17 +487,17 @@ extern "C" __device__ double test_atan(double x) {
 
 // DEFAULT-LABEL: @test_atanhf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_atanhf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_atanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_atanhf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_atanhf(float x) {
@@ -506,17 +506,17 @@ extern "C" __device__ float test_atanhf(float x) {
 
 // DEFAULT-LABEL: @test_atanh(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_atanh(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_atanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_atanh(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_atanh(double x) {
@@ -525,17 +525,17 @@ extern "C" __device__ double test_atanh(double x) {
 
 // DEFAULT-LABEL: @test_cbrtf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cbrtf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_cbrtf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_cbrtf(float x) {
@@ -544,17 +544,17 @@ extern "C" __device__ float test_cbrtf(float x) {
 
 // DEFAULT-LABEL: @test_cbrt(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cbrt(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_cbrt(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_cbrt(double x) {
@@ -563,17 +563,17 @@ extern "C" __device__ double test_cbrt(double x) {
 
 // DEFAULT-LABEL: @test_ceilf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.ceil.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_ceilf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.ceil.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_ceilf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.ceil.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_ceilf(float x) {
@@ -582,17 +582,17 @@ extern "C" __device__ float test_ceilf(float x) {
 
 // DEFAULT-LABEL: @test_ceil(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.ceil.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_ceil(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.ceil.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_ceil(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.ceil.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_ceil(double x) {
@@ -601,17 +601,17 @@ extern "C" __device__ double test_ceil(double x) {
 
 // DEFAULT-LABEL: @test_copysignf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_copysignf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_copysignf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_copysignf(float x, float y) {
@@ -620,17 +620,17 @@ extern "C" __device__ float test_copysignf(float x, float y) {
 
 // DEFAULT-LABEL: @test_copysign(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_copysign(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_copysign(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_copysign(double x, double y) {
@@ -639,17 +639,17 @@ extern "C" __device__ double test_copysign(double x, double y) {
 
 // DEFAULT-LABEL: @test_cosf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cosf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16:[0-9]+]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16:[0-9]+]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_cosf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16:[0-9]+]]
 // APPROX-NEXT:    ret float [[CALL_I_I]]
 //
 extern "C" __device__ float test_cosf(float x) {
@@ -658,17 +658,17 @@ extern "C" __device__ float test_cosf(float x) {
 
 // DEFAULT-LABEL: @test_cos(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cos(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cos_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_cos(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_cos(double x) {
@@ -677,17 +677,17 @@ extern "C" __device__ double test_cos(double x) {
 
 // DEFAULT-LABEL: @test_coshf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_coshf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cosh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_coshf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_coshf(float x) {
@@ -696,17 +696,17 @@ extern "C" __device__ float test_coshf(float x) {
 
 // DEFAULT-LABEL: @test_cosh(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cosh(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cosh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_cosh(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_cosh(double x) {
@@ -715,17 +715,17 @@ extern "C" __device__ double test_cosh(double x) {
 
 // DEFAULT-LABEL: @test_cospif(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cospif(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cospi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_cospif(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_cospif(float x) {
@@ -734,17 +734,17 @@ extern "C" __device__ float test_cospif(float x) {
 
 // DEFAULT-LABEL: @test_cospi(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cospi(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_cospi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_cospi(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_cospi(double x) {
@@ -753,17 +753,17 @@ extern "C" __device__ double test_cospi(double x) {
 
 // DEFAULT-LABEL: @test_cyl_bessel_i0f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cyl_bessel_i0f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_cyl_bessel_i0f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_cyl_bessel_i0f(float x) {
@@ -772,17 +772,17 @@ extern "C" __device__ float test_cyl_bessel_i0f(float x) {
 
 // DEFAULT-LABEL: @test_cyl_bessel_i0(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cyl_bessel_i0(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_cyl_bessel_i0(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_cyl_bessel_i0(double x) {
@@ -791,17 +791,17 @@ extern "C" __device__ double test_cyl_bessel_i0(double x) {
 
 // DEFAULT-LABEL: @test_cyl_bessel_i1f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cyl_bessel_i1f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_i1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_cyl_bessel_i1f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_cyl_bessel_i1f(float x) {
@@ -810,17 +810,17 @@ extern "C" __device__ float test_cyl_bessel_i1f(float x) {
 
 // DEFAULT-LABEL: @test_cyl_bessel_i1(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_cyl_bessel_i1(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_i1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_cyl_bessel_i1(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_cyl_bessel_i1(double x) {
@@ -829,17 +829,17 @@ extern "C" __device__ double test_cyl_bessel_i1(double x) {
 
 // DEFAULT-LABEL: @test_erfcf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_erfcf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfc_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_erfcf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_erfcf(float x) {
@@ -848,17 +848,17 @@ extern "C" __device__ float test_erfcf(float x) {
 
 // DEFAULT-LABEL: @test_erfc(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_erfc(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfc_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_erfc(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_erfc(double x) {
@@ -867,17 +867,17 @@ extern "C" __device__ double test_erfc(double x) {
 
 // DEFAULT-LABEL: @test_erfinvf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_erfinvf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_erfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_erfinvf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_erfinvf(float x) {
@@ -886,17 +886,17 @@ extern "C" __device__ float test_erfinvf(float x) {
 
 // DEFAULT-LABEL: @test_erfinv(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_erfinv(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_erfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_erfinv(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_erfinv(double x) {
@@ -905,17 +905,17 @@ extern "C" __device__ double test_erfinv(double x) {
 
 // DEFAULT-LABEL: @test_exp10f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_exp10f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_exp10_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_exp10f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_exp10f(float x) {
@@ -924,17 +924,17 @@ extern "C" __device__ float test_exp10f(float x) {
 
 // DEFAULT-LABEL: @test_exp10(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_exp10(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_exp10(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_exp10(double x) {
@@ -943,17 +943,17 @@ extern "C" __device__ double test_exp10(double x) {
 
 // DEFAULT-LABEL: @test_exp2f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.exp2.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_exp2f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.exp2.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_exp2f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.exp2.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_exp2f(float x) {
@@ -962,17 +962,17 @@ extern "C" __device__ float test_exp2f(float x) {
 
 // DEFAULT-LABEL: @test_exp2(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_exp2(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_exp2(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_exp2(double x) {
@@ -981,17 +981,17 @@ extern "C" __device__ double test_exp2(double x) {
 
 // DEFAULT-LABEL: @test_expf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.exp.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_expf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.exp.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_expf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.exp.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_expf(float x) {
@@ -1000,17 +1000,17 @@ extern "C" __device__ float test_expf(float x) {
 
 // DEFAULT-LABEL: @test_exp(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_exp(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_exp_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_exp(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_exp(double x) {
@@ -1019,17 +1019,17 @@ extern "C" __device__ double test_exp(double x) {
 
 // DEFAULT-LABEL: @test_expm1f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_expm1f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_expm1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_expm1f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_expm1f(float x) {
@@ -1038,17 +1038,17 @@ extern "C" __device__ float test_expm1f(float x) {
 
 // DEFAULT-LABEL: @test_expm1(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_expm1(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_expm1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_expm1(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_expm1(double x) {
@@ -1057,17 +1057,17 @@ extern "C" __device__ double test_expm1(double x) {
 
 // DEFAULT-LABEL: @test_fabsf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fabs.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fabsf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.fabs.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_fabsf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fabs.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_fabsf(float x) {
@@ -1076,17 +1076,17 @@ extern "C" __device__ float test_fabsf(float x) {
 
 // DEFAULT-LABEL: @test_fabs(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fabs.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fabs(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.fabs.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fabs.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_fabs(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fabs.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_fabs(double x) {
@@ -1095,17 +1095,17 @@ extern "C" __device__ double test_fabs(double x) {
 
 // DEFAULT-LABEL: @test_fdimf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_fdimf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fdim_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_fdimf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_fdimf(float x, float y) {
@@ -1114,17 +1114,17 @@ extern "C" __device__ float test_fdimf(float x, float y) {
 
 // DEFAULT-LABEL: @test_fdim(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_fdim(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fdim_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_fdim(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_fdim(double x, double y) {
@@ -1152,17 +1152,17 @@ extern "C" __device__ float test_fdividef(float x, float y) {
 
 // DEFAULT-LABEL: @test_floorf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.floor.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_floorf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.floor.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_floorf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.floor.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_floorf(float x) {
@@ -1171,17 +1171,17 @@ extern "C" __device__ float test_floorf(float x) {
 
 // DEFAULT-LABEL: @test_floor(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.floor.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_floor(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.floor.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_floor(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.floor.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_floor(double x) {
@@ -1190,17 +1190,17 @@ extern "C" __device__ double test_floor(double x) {
 
 // DEFAULT-LABEL: @test_fmaf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fmaf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_fmaf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_fmaf(float x, float y, float z) {
@@ -1209,17 +1209,17 @@ extern "C" __device__ float test_fmaf(float x, float y, float z) {
 
 // DEFAULT-LABEL: @test_fma(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fma(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_fma(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_fma(double x, double y, double z) {
@@ -1228,17 +1228,17 @@ extern "C" __device__ double test_fma(double x, double y, double z) {
 
 // DEFAULT-LABEL: @test_fma_rn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fma_rn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_fma_rn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_fma_rn(double x, double y, double z) {
@@ -1247,17 +1247,17 @@ extern "C" __device__ double test_fma_rn(double x, double y, double z) {
 
 // DEFAULT-LABEL: @test_fmaxf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fmaxf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_fmaxf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_fmaxf(float x, float y) {
@@ -1266,17 +1266,17 @@ extern "C" __device__ float test_fmaxf(float x, float y) {
 
 // DEFAULT-LABEL: @test_fmax(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fmax(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_fmax(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_fmax(double x, double y) {
@@ -1285,17 +1285,17 @@ extern "C" __device__ double test_fmax(double x, double y) {
 
 // DEFAULT-LABEL: @test_fminf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fminf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_fminf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_fminf(float x, float y) {
@@ -1304,17 +1304,17 @@ extern "C" __device__ float test_fminf(float x, float y) {
 
 // DEFAULT-LABEL: @test_fmin(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_fmin(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_fmin(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_fmin(double x, double y) {
@@ -1323,17 +1323,17 @@ extern "C" __device__ double test_fmin(double x, double y) {
 
 // DEFAULT-LABEL: @test_fmodf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_fmodf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_fmod_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_fmodf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_fmodf(float x, float y) {
@@ -1342,17 +1342,17 @@ extern "C" __device__ float test_fmodf(float x, float y) {
 
 // DEFAULT-LABEL: @test_fmod(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_fmod(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_fmod_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_fmod(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_fmod(double x, double y) {
@@ -1385,17 +1385,17 @@ extern "C" __device__ double test_frexp(double x, int* y) {
 
 // DEFAULT-LABEL: @test_hypotf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_hypotf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_hypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_hypotf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_hypotf(float x, float y) {
@@ -1404,17 +1404,17 @@ extern "C" __device__ float test_hypotf(float x, float y) {
 
 // DEFAULT-LABEL: @test_hypot(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_hypot(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_hypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_hypot(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_hypot(double x, double y) {
@@ -1423,17 +1423,17 @@ extern "C" __device__ double test_hypot(double x, double y) {
 
 // DEFAULT-LABEL: @test_ilogbf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret i32 [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_ilogbf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret i32 [[CALL_I]]
 //
 // APPROX-LABEL: @test_ilogbf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret i32 [[CALL_I]]
 //
 extern "C" __device__ int test_ilogbf(float x) {
@@ -1442,17 +1442,17 @@ extern "C" __device__ int test_ilogbf(float x) {
 
 // DEFAULT-LABEL: @test_ilogb(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret i32 [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_ilogb(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret i32 [[CALL_I]]
 //
 // APPROX-LABEL: @test_ilogb(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret i32 [[CALL_I]]
 //
 extern "C" __device__ int test_ilogb(double x) {
@@ -1589,17 +1589,17 @@ extern "C" __device__ BOOL_TYPE test___isnan(double x) {
 
 // DEFAULT-LABEL: @test_j0f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j0f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_j0f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_j0f(float x) {
@@ -1608,17 +1608,17 @@ extern "C" __device__ float test_j0f(float x) {
 
 // DEFAULT-LABEL: @test_j0(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j0(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_j0(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_j0(double x) {
@@ -1627,17 +1627,17 @@ extern "C" __device__ double test_j0(double x) {
 
 // DEFAULT-LABEL: @test_j1f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j1f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_j1f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_j1f(float x) {
@@ -1646,17 +1646,17 @@ extern "C" __device__ float test_j1f(float x) {
 
 // DEFAULT-LABEL: @test_j1(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_j1(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_j1(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_j1(double x) {
@@ -1670,14 +1670,14 @@ extern "C" __device__ double test_j1(double x) {
 // DEFAULT-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT:    ]
 // DEFAULT:       if.then.i:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL3JNFIF_EXIT:%.*]]
 // DEFAULT:       if.then2.i:
-// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL3JNFIF_EXIT]]
 // DEFAULT:       if.end4.i:
-// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // DEFAULT:       for.body.i:
@@ -1703,14 +1703,14 @@ extern "C" __device__ double test_j1(double x) {
 // FINITEONLY-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT:    ]
 // FINITEONLY:       if.then.i:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL3JNFIF_EXIT:%.*]]
 // FINITEONLY:       if.then2.i:
-// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL3JNFIF_EXIT]]
 // FINITEONLY:       if.end4.i:
-// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_j1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // FINITEONLY:       for.body.i:
@@ -1736,14 +1736,14 @@ extern "C" __device__ double test_j1(double x) {
 // APPROX-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT:    ]
 // APPROX:       if.then.i:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL3JNFIF_EXIT:%.*]]
 // APPROX:       if.then2.i:
-// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL3JNFIF_EXIT]]
 // APPROX:       if.end4.i:
-// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]]
 // APPROX:       for.body.i:
@@ -1773,14 +1773,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // DEFAULT-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT:    ]
 // DEFAULT:       if.then.i:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL2JNID_EXIT:%.*]]
 // DEFAULT:       if.then2.i:
-// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL2JNID_EXIT]]
 // DEFAULT:       if.end4.i:
-// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // DEFAULT:       for.body.i:
@@ -1806,14 +1806,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // FINITEONLY-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT:    ]
 // FINITEONLY:       if.then.i:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL2JNID_EXIT:%.*]]
 // FINITEONLY:       if.then2.i:
-// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL2JNID_EXIT]]
 // FINITEONLY:       if.end4.i:
-// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_j1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // FINITEONLY:       for.body.i:
@@ -1839,14 +1839,14 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // APPROX-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT:    ]
 // APPROX:       if.then.i:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL2JNID_EXIT:%.*]]
 // APPROX:       if.then2.i:
-// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL2JNID_EXIT]]
 // APPROX:       if.end4.i:
-// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]]
 // APPROX:       for.body.i:
@@ -1871,17 +1871,17 @@ extern "C" __device__ double test_jn(int x, double y) {
 
 // DEFAULT-LABEL: @test_ldexpf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_ldexpf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_ldexpf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_ldexpf(float x, int y) {
@@ -1890,17 +1890,17 @@ extern "C" __device__ float test_ldexpf(float x, int y) {
 
 // DEFAULT-LABEL: @test_ldexp(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_ldexp(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_ldexp(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_ldexp(double x, int y) {
@@ -1909,17 +1909,17 @@ extern "C" __device__ double test_ldexp(double x, int y) {
 
 // DEFAULT-LABEL: @test_lgammaf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_lgammaf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_lgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_lgammaf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_lgammaf(float x) {
@@ -1928,17 +1928,17 @@ extern "C" __device__ float test_lgammaf(float x) {
 
 // DEFAULT-LABEL: @test_lgamma(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_lgamma(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_lgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_lgamma(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_lgamma(double x) {
@@ -2035,17 +2035,17 @@ extern "C" __device__ long long int test_llround(double x) {
 
 // DEFAULT-LABEL: @test_log10f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log10.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_log10f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.log10.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_log10f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log10.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_log10f(float x) {
@@ -2054,17 +2054,17 @@ extern "C" __device__ float test_log10f(float x) {
 
 // DEFAULT-LABEL: @test_log10(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log10(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log10_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log10(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log10(double x) {
@@ -2073,17 +2073,17 @@ extern "C" __device__ double test_log10(double x) {
 
 // DEFAULT-LABEL: @test_log1pf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log1pf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log1p_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_log1pf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_log1pf(float x) {
@@ -2092,17 +2092,17 @@ extern "C" __device__ float test_log1pf(float x) {
 
 // DEFAULT-LABEL: @test_log1p(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log1p(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log1p_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log1p(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log1p(double x) {
@@ -2111,17 +2111,17 @@ extern "C" __device__ double test_log1p(double x) {
 
 // DEFAULT-LABEL: @test_log2f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log2f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log2_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_log2f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.log.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_log2f(float x) {
@@ -2130,17 +2130,17 @@ extern "C" __device__ float test_log2f(float x) {
 
 // DEFAULT-LABEL: @test_log2(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_log2(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_log2_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_log2(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_log2(double x) {
@@ -2149,17 +2149,17 @@ extern "C" __device__ double test_log2(double x) {
 
 // DEFAULT-LABEL: @test_logbf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logbf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_logb_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_logbf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_logbf(float x) {
@@ -2168,17 +2168,17 @@ extern "C" __device__ float test_logbf(float x) {
 
 // DEFAULT-LABEL: @test_logb(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logb(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_logb_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_logb(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_logb(double x) {
@@ -2187,17 +2187,17 @@ extern "C" __device__ double test_logb(double x) {
 
 // DEFAULT-LABEL: @test_logf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_logf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_log_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_logf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_logf(float x) {
@@ -2296,7 +2296,7 @@ extern "C" __device__ long int test_lround(double x) {
 // DEFAULT-NEXT:  entry:
 // DEFAULT-NEXT:    [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
 // DEFAULT-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // DEFAULT-NEXT:    store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
 // DEFAULT-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -2306,7 +2306,7 @@ extern "C" __device__ long int test_lround(double x) {
 // FINITEONLY-NEXT:  entry:
 // FINITEONLY-NEXT:    [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
 // FINITEONLY-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_modf_f32(float noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // FINITEONLY-NEXT:    store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
 // FINITEONLY-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -2316,7 +2316,7 @@ extern "C" __device__ long int test_lround(double x) {
 // APPROX-NEXT:  entry:
 // APPROX-NEXT:    [[__TMP_I:%.*]] = alloca float, align 4, addrspace(5)
 // APPROX-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17:[0-9]+]]
-// APPROX-NEXT:    [[CALL_I:%.*]] = call contract float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = call contract noundef float @__ocml_modf_f32(float noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // APPROX-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA16:![0-9]+]]
 // APPROX-NEXT:    store float [[TMP0]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA16]]
 // APPROX-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -2330,7 +2330,7 @@ extern "C" __device__ float test_modff(float x, float* y) {
 // DEFAULT-NEXT:  entry:
 // DEFAULT-NEXT:    [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
 // DEFAULT-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // DEFAULT-NEXT:    store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
 // DEFAULT-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -2340,7 +2340,7 @@ extern "C" __device__ float test_modff(float x, float* y) {
 // FINITEONLY-NEXT:  entry:
 // FINITEONLY-NEXT:    [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
 // FINITEONLY-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_modf_f64(double noundef nofpclass(nan inf) [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // FINITEONLY-NEXT:    store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
 // FINITEONLY-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -2350,7 +2350,7 @@ extern "C" __device__ float test_modff(float x, float* y) {
 // APPROX-NEXT:  entry:
 // APPROX-NEXT:    [[__TMP_I:%.*]] = alloca double, align 8, addrspace(5)
 // APPROX-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT:    [[CALL_I:%.*]] = call contract double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = call contract noundef double @__ocml_modf_f64(double noundef [[X:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // APPROX-NEXT:    [[TMP0:%.*]] = load double, ptr addrspace(5) [[__TMP_I]], align 8, !tbaa [[TBAA18:![0-9]+]]
 // APPROX-NEXT:    store double [[TMP0]], ptr [[Y:%.*]], align 8, !tbaa [[TBAA18]]
 // APPROX-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -2557,33 +2557,65 @@ extern "C" __device__ double test_nan(const char *tag) {
   return nan(tag);
 }
 
-// CHECK-LABEL: @test_nanf_emptystr(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret float 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nanf_emptystr(
+// DEFAULT-NEXT:  entry:
+// DEFAULT-NEXT:    ret float 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nanf_emptystr(
+// FINITEONLY-NEXT:  entry:
+// FINITEONLY-NEXT:    ret float poison
+//
+// APPROX-LABEL: @test_nanf_emptystr(
+// APPROX-NEXT:  entry:
+// APPROX-NEXT:    ret float 0x7FF8000000000000
 //
 extern "C" __device__ float test_nanf_emptystr() {
   return nanf("");
 }
 
-// CHECK-LABEL: @test_nan_emptystr(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret double 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nan_emptystr(
+// DEFAULT-NEXT:  entry:
+// DEFAULT-NEXT:    ret double 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nan_emptystr(
+// FINITEONLY-NEXT:  entry:
+// FINITEONLY-NEXT:    ret double poison
+//
+// APPROX-LABEL: @test_nan_emptystr(
+// APPROX-NEXT:  entry:
+// APPROX-NEXT:    ret double 0x7FF8000000000000
 //
 extern "C" __device__ double test_nan_emptystr() {
   return nan("");
 }
 
-// CHECK-LABEL: @test_nanf_fill(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret float 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nanf_fill(
+// DEFAULT-NEXT:  entry:
+// DEFAULT-NEXT:    ret float 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nanf_fill(
+// FINITEONLY-NEXT:  entry:
+// FINITEONLY-NEXT:    ret float poison
+//
+// APPROX-LABEL: @test_nanf_fill(
+// APPROX-NEXT:  entry:
+// APPROX-NEXT:    ret float 0x7FF8000000000000
 //
 extern "C" __device__ float test_nanf_fill() {
   return nanf("0x456");
 }
 
-// CHECK-LABEL: @test_nan_fill(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret double 0x7FF8000000000000
+// DEFAULT-LABEL: @test_nan_fill(
+// DEFAULT-NEXT:  entry:
+// DEFAULT-NEXT:    ret double 0x7FF8000000000000
+//
+// FINITEONLY-LABEL: @test_nan_fill(
+// FINITEONLY-NEXT:  entry:
+// FINITEONLY-NEXT:    ret double poison
+//
+// APPROX-LABEL: @test_nan_fill(
+// APPROX-NEXT:  entry:
+// APPROX-NEXT:    ret double 0x7FF8000000000000
 //
 extern "C" __device__ double test_nan_fill() {
   return nan("0x123");
@@ -2591,17 +2623,17 @@ extern "C" __device__ double test_nan_fill() {
 
 // DEFAULT-LABEL: @test_nearbyintf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.nearbyint.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_nearbyintf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.nearbyint.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_nearbyintf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.nearbyint.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_nearbyintf(float x) {
@@ -2610,17 +2642,17 @@ extern "C" __device__ float test_nearbyintf(float x) {
 
 // DEFAULT-LABEL: @test_nearbyint(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.nearbyint.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_nearbyint(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.nearbyint.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_nearbyint(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.nearbyint.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.nearbyint.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_nearbyint(double x) {
@@ -2629,17 +2661,17 @@ extern "C" __device__ double test_nearbyint(double x) {
 
 // DEFAULT-LABEL: @test_nextafterf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_nextafterf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_nextafter_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_nextafterf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_nextafterf(float x, float y) {
@@ -2648,17 +2680,17 @@ extern "C" __device__ float test_nextafterf(float x, float y) {
 
 // DEFAULT-LABEL: @test_nextafter(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_nextafter(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_nextafter_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_nextafter(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_nextafter(double x, double y) {
@@ -2667,17 +2699,17 @@ extern "C" __device__ double test_nextafter(double x, double y) {
 
 // DEFAULT-LABEL: @test_norm3df(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm3df(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm3df(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_norm3df(float x, float y, float z) {
@@ -2686,17 +2718,17 @@ extern "C" __device__ float test_norm3df(float x, float y, float z) {
 
 // DEFAULT-LABEL: @test_norm3d(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm3d(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm3d(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_norm3d(double x, double y, double z) {
@@ -2705,17 +2737,17 @@ extern "C" __device__ double test_norm3d(double x, double y, double z) {
 
 // DEFAULT-LABEL: @test_norm4df(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm4df(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_len4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm4df(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
@@ -2724,17 +2756,17 @@ extern "C" __device__ float test_norm4df(float x, float y, float z, float w) {
 
 // DEFAULT-LABEL: @test_norm4d(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_norm4d(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_len4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_norm4d(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_norm4d(double x, double y, double z, double w) {
@@ -2743,17 +2775,17 @@ extern "C" __device__ double test_norm4d(double x, double y, double z, double w)
 
 // DEFAULT-LABEL: @test_normcdff(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdff(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdf_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdff(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_normcdff(float x) {
@@ -2762,17 +2794,17 @@ extern "C" __device__ float test_normcdff(float x) {
 
 // DEFAULT-LABEL: @test_normcdf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdf_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_normcdf(double x) {
@@ -2781,17 +2813,17 @@ extern "C" __device__ double test_normcdf(double x) {
 
 // DEFAULT-LABEL: @test_normcdfinvf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdfinvf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_ncdfinv_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdfinvf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_normcdfinvf(float x) {
@@ -2800,17 +2832,17 @@ extern "C" __device__ float test_normcdfinvf(float x) {
 
 // DEFAULT-LABEL: @test_normcdfinv(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_normcdfinv(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_ncdfinv_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_normcdfinv(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_normcdfinv(double x) {
@@ -2834,7 +2866,7 @@ extern "C" __device__ double test_normcdfinv(double x) {
 // DEFAULT-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]]
 // DEFAULT:       _ZL5normfiPKf.exit:
 // DEFAULT-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// DEFAULT-NEXT:    [[TMP1:%.*]] = tail call contract float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]])
+// DEFAULT-NEXT:    [[TMP1:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]])
 // DEFAULT-NEXT:    ret float [[TMP1]]
 //
 // FINITEONLY-LABEL: @test_normf(
@@ -2854,7 +2886,7 @@ extern "C" __device__ double test_normcdfinv(double x) {
 // FINITEONLY-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]]
 // FINITEONLY:       _ZL5normfiPKf.exit:
 // FINITEONLY-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// FINITEONLY-NEXT:    [[TMP1:%.*]] = tail call nnan ninf contract float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]])
+// FINITEONLY-NEXT:    [[TMP1:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]])
 // FINITEONLY-NEXT:    ret float [[TMP1]]
 //
 // APPROX-LABEL: @test_normf(
@@ -2874,7 +2906,7 @@ extern "C" __device__ double test_normcdfinv(double x) {
 // APPROX-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]]
 // APPROX:       _ZL5normfiPKf.exit:
 // APPROX-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// APPROX-NEXT:    [[TMP1:%.*]] = tail call contract float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]])
+// APPROX-NEXT:    [[TMP1:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]])
 // APPROX-NEXT:    ret float [[TMP1]]
 //
 extern "C" __device__ float test_normf(int x, const float *y) {
@@ -2898,7 +2930,7 @@ extern "C" __device__ float test_normf(int x, const float *y) {
 // DEFAULT-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]]
 // DEFAULT:       _ZL4normiPKd.exit:
 // DEFAULT-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// DEFAULT-NEXT:    [[TMP1:%.*]] = tail call contract double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]])
+// DEFAULT-NEXT:    [[TMP1:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]])
 // DEFAULT-NEXT:    ret double [[TMP1]]
 //
 // FINITEONLY-LABEL: @test_norm(
@@ -2918,7 +2950,7 @@ extern "C" __device__ float test_normf(int x, const float *y) {
 // FINITEONLY-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]]
 // FINITEONLY:       _ZL4normiPKd.exit:
 // FINITEONLY-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// FINITEONLY-NEXT:    [[TMP1:%.*]] = tail call nnan ninf contract double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]])
+// FINITEONLY-NEXT:    [[TMP1:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]])
 // FINITEONLY-NEXT:    ret double [[TMP1]]
 //
 // APPROX-LABEL: @test_norm(
@@ -2938,7 +2970,7 @@ extern "C" __device__ float test_normf(int x, const float *y) {
 // APPROX-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]]
 // APPROX:       _ZL4normiPKd.exit:
 // APPROX-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// APPROX-NEXT:    [[TMP1:%.*]] = tail call contract double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]])
+// APPROX-NEXT:    [[TMP1:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]])
 // APPROX-NEXT:    ret double [[TMP1]]
 //
 extern "C" __device__ double test_norm(int x, const double *y) {
@@ -2947,17 +2979,17 @@ extern "C" __device__ double test_norm(int x, const double *y) {
 
 // DEFAULT-LABEL: @test_powf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_powf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_powf(float x, float y) {
@@ -2966,17 +2998,17 @@ extern "C" __device__ float test_powf(float x, float y) {
 
 // DEFAULT-LABEL: @test_pow(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_pow(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pow_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_pow(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_pow(double x, double y) {
@@ -2985,17 +3017,17 @@ extern "C" __device__ double test_pow(double x, double y) {
 
 // DEFAULT-LABEL: @test_powif(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powif(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pown_f32(float noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_powif(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_powif(float x, int y) {
@@ -3004,17 +3036,17 @@ extern "C" __device__ float test_powif(float x, int y) {
 
 // DEFAULT-LABEL: @test_powi(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_powi(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_pown_f64(double noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_powi(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_powi(double x, int y) {
@@ -3023,17 +3055,17 @@ extern "C" __device__ double test_powi(double x, int y) {
 
 // DEFAULT-LABEL: @test_rcbrtf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rcbrtf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rcbrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rcbrtf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rcbrtf(float x) {
@@ -3042,17 +3074,17 @@ extern "C" __device__ float test_rcbrtf(float x) {
 
 // DEFAULT-LABEL: @test_rcbrt(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rcbrt(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rcbrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rcbrt(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rcbrt(double x) {
@@ -3061,17 +3093,17 @@ extern "C" __device__ double test_rcbrt(double x) {
 
 // DEFAULT-LABEL: @test_remainderf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remainderf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remainder_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_remainderf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_remainderf(float x, float y) {
@@ -3080,17 +3112,17 @@ extern "C" __device__ float test_remainderf(float x, float y) {
 
 // DEFAULT-LABEL: @test_remainder(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_remainder(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remainder_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_remainder(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_remainder(double x, double y) {
@@ -3101,7 +3133,7 @@ extern "C" __device__ double test_remainder(double x, double y) {
 // DEFAULT-NEXT:  entry:
 // DEFAULT-NEXT:    [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
 // DEFAULT-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT:    store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -3111,7 +3143,7 @@ extern "C" __device__ double test_remainder(double x, double y) {
 // FINITEONLY-NEXT:  entry:
 // FINITEONLY-NEXT:    [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
 // FINITEONLY-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT:    store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -3121,7 +3153,7 @@ extern "C" __device__ double test_remainder(double x, double y) {
 // APPROX-NEXT:  entry:
 // APPROX-NEXT:    [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
 // APPROX-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT:    [[CALL_I:%.*]] = call contract float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // APPROX-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // APPROX-NEXT:    store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
 // APPROX-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -3135,7 +3167,7 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) {
 // DEFAULT-NEXT:  entry:
 // DEFAULT-NEXT:    [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
 // DEFAULT-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT:    store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
 // DEFAULT-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -3145,7 +3177,7 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) {
 // FINITEONLY-NEXT:  entry:
 // FINITEONLY-NEXT:    [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
 // FINITEONLY-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT:    store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
 // FINITEONLY-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -3155,7 +3187,7 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) {
 // APPROX-NEXT:  entry:
 // APPROX-NEXT:    [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5)
 // APPROX-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
-// APPROX-NEXT:    [[CALL_I:%.*]] = call contract double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR16]]
 // APPROX-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[TBAA12]]
 // APPROX-NEXT:    store i32 [[TMP0]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA12]]
 // APPROX-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR17]]
@@ -3167,17 +3199,17 @@ extern "C" __device__ double test_remquo(double x, double y, int* z) {
 
 // DEFAULT-LABEL: @test_rhypotf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rhypotf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rhypot_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rhypotf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rhypotf(float x, float y) {
@@ -3186,17 +3218,17 @@ extern "C" __device__ float test_rhypotf(float x, float y) {
 
 // DEFAULT-LABEL: @test_rhypot(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rhypot(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rhypot_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rhypot(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rhypot(double x, double y) {
@@ -3205,17 +3237,17 @@ extern "C" __device__ double test_rhypot(double x, double y) {
 
 // DEFAULT-LABEL: @test_rintf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_rintf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_rintf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.rint.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_rintf(float x) {
@@ -3224,17 +3256,17 @@ extern "C" __device__ float test_rintf(float x) {
 
 // DEFAULT-LABEL: @test_rint(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_rint(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_rint(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.rint.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_rint(double x) {
@@ -3258,7 +3290,7 @@ extern "C" __device__ double test_rint(double x) {
 // DEFAULT-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]]
 // DEFAULT:       _ZL6rnormfiPKf.exit:
 // DEFAULT-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rsqrt_f32(float noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rnormf(
@@ -3278,7 +3310,7 @@ extern "C" __device__ double test_rint(double x) {
 // FINITEONLY-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]]
 // FINITEONLY:       _ZL6rnormfiPKf.exit:
 // FINITEONLY-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_LCSSA_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[__R_0_LCSSA_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rnormf(
@@ -3298,7 +3330,7 @@ extern "C" __device__ double test_rint(double x) {
 // APPROX-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]]
 // APPROX:       _ZL6rnormfiPKf.exit:
 // APPROX-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rsqrt_f32(float noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rnormf(int x, const float* y) {
@@ -3322,7 +3354,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) {
 // DEFAULT-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]]
 // DEFAULT:       _ZL5rnormiPKd.exit:
 // DEFAULT-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rsqrt_f64(double noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rnorm(
@@ -3342,7 +3374,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) {
 // FINITEONLY-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]]
 // FINITEONLY:       _ZL5rnormiPKd.exit:
 // FINITEONLY-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_LCSSA_I]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[__R_0_LCSSA_I]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rnorm(
@@ -3362,7 +3394,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) {
 // APPROX-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]]
 // APPROX:       _ZL5rnormiPKd.exit:
 // APPROX-NEXT:    [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ]
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rsqrt_f64(double noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_LCSSA_I]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rnorm(int x, const double* y) {
@@ -3371,17 +3403,17 @@ extern "C" __device__ double test_rnorm(int x, const double* y) {
 
 // DEFAULT-LABEL: @test_rnorm3df(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rnorm3df(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen3_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rnorm3df(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rnorm3df(float x, float y, float z) {
@@ -3390,17 +3422,17 @@ extern "C" __device__ float test_rnorm3df(float x, float y, float z) {
 
 // DEFAULT-LABEL: @test_rnorm3d(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rnorm3d(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen3_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rnorm3d(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rnorm3d(double x, double y, double z) {
@@ -3409,17 +3441,17 @@ extern "C" __device__ double test_rnorm3d(double x, double y, double z) {
 
 // DEFAULT-LABEL: @test_rnorm4df(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rnorm4df(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rlen4_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]], float noundef nofpclass(nan inf) [[Z:%.*]], float noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rnorm4df(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) {
@@ -3428,17 +3460,17 @@ extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) {
 
 // DEFAULT-LABEL: @test_rnorm4d(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rnorm4d(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rlen4_f64(double noundef nofpclass(nan inf) [[X:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]], double noundef nofpclass(nan inf) [[W:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rnorm4d(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w) {
@@ -3447,17 +3479,17 @@ extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w
 
 // DEFAULT-LABEL: @test_roundf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_roundf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_roundf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.round.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_roundf(float x) {
@@ -3466,17 +3498,17 @@ extern "C" __device__ float test_roundf(float x) {
 
 // DEFAULT-LABEL: @test_round(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_round(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_round(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.round.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_round(double x) {
@@ -3485,17 +3517,17 @@ extern "C" __device__ double test_round(double x) {
 
 // DEFAULT-LABEL: @test_rsqrtf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rsqrtf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_rsqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_rsqrtf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_rsqrtf(float x) {
@@ -3504,17 +3536,17 @@ extern "C" __device__ float test_rsqrtf(float x) {
 
 // DEFAULT-LABEL: @test_rsqrt(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_rsqrt(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_rsqrt_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_rsqrt(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_rsqrt(double x) {
@@ -3621,17 +3653,17 @@ extern "C" __device__ double test_scalbln(double x, long int y) {
 
 // DEFAULT-LABEL: @test_scalbnf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_scalbnf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_scalbnf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_scalbnf(float x, int y) {
@@ -3640,17 +3672,17 @@ extern "C" __device__ float test_scalbnf(float x, int y) {
 
 // DEFAULT-LABEL: @test_scalbn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_scalbn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_scalbn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_scalbn(double x, int y) {
@@ -3828,17 +3860,17 @@ extern "C" __device__ void test_sincospi(double x, double *y, double *z) {
 
 // DEFAULT-LABEL: @test_sinf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_sinf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_sinf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I_I]]
 //
 extern "C" __device__ float test_sinf(float x) {
@@ -3847,17 +3879,17 @@ extern "C" __device__ float test_sinf(float x) {
 
 // DEFAULT-LABEL: @test_sin(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_sin(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sin_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_sin(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_sin(double x) {
@@ -3866,17 +3898,17 @@ extern "C" __device__ double test_sin(double x) {
 
 // DEFAULT-LABEL: @test_sinpif(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_sinpif(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sinpi_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_sinpif(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_sinpif(float x) {
@@ -3885,17 +3917,17 @@ extern "C" __device__ float test_sinpif(float x) {
 
 // DEFAULT-LABEL: @test_sinpi(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_sinpi(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_sinpi_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_sinpi(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_sinpi(double x) {
@@ -3904,17 +3936,17 @@ extern "C" __device__ double test_sinpi(double x) {
 
 // DEFAULT-LABEL: @test_sqrtf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.sqrt.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_sqrtf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.sqrt.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_sqrtf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.sqrt.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_sqrtf(float x) {
@@ -3923,17 +3955,17 @@ extern "C" __device__ float test_sqrtf(float x) {
 
 // DEFAULT-LABEL: @test_sqrt(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.sqrt.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_sqrt(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.sqrt.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_sqrt(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.sqrt.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_sqrt(double x) {
@@ -3942,17 +3974,17 @@ extern "C" __device__ double test_sqrt(double x) {
 
 // DEFAULT-LABEL: @test_tanf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_tanf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tan_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_tanf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_tanf(float x) {
@@ -3961,17 +3993,17 @@ extern "C" __device__ float test_tanf(float x) {
 
 // DEFAULT-LABEL: @test_tan(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_tan(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tan_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_tan(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_tan(double x) {
@@ -3980,17 +4012,17 @@ extern "C" __device__ double test_tan(double x) {
 
 // DEFAULT-LABEL: @test_tanhf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_tanhf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tanh_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_tanhf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_tanhf(float x) {
@@ -3999,17 +4031,17 @@ extern "C" __device__ float test_tanhf(float x) {
 
 // DEFAULT-LABEL: @test_tanh(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_tanh(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tanh_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_tanh(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_tanh(double x) {
@@ -4018,17 +4050,17 @@ extern "C" __device__ double test_tanh(double x) {
 
 // DEFAULT-LABEL: @test_tgammaf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_tgammaf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_tgamma_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_tgammaf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_tgammaf(float x) {
@@ -4037,17 +4069,17 @@ extern "C" __device__ float test_tgammaf(float x) {
 
 // DEFAULT-LABEL: @test_tgamma(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_tgamma(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_tgamma_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_tgamma(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_tgamma(double x) {
@@ -4056,17 +4088,17 @@ extern "C" __device__ double test_tgamma(double x) {
 
 // DEFAULT-LABEL: @test_truncf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.trunc.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_truncf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.trunc.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_truncf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.trunc.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_truncf(float x) {
@@ -4075,17 +4107,17 @@ extern "C" __device__ float test_truncf(float x) {
 
 // DEFAULT-LABEL: @test_trunc(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.trunc.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_trunc(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.trunc.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_trunc(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.trunc.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_trunc(double x) {
@@ -4094,17 +4126,17 @@ extern "C" __device__ double test_trunc(double x) {
 
 // DEFAULT-LABEL: @test_y0f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_y0f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_y0f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_y0f(float x) {
@@ -4113,17 +4145,17 @@ extern "C" __device__ float test_y0f(float x) {
 
 // DEFAULT-LABEL: @test_y0(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_y0(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_y0(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_y0(double x) {
@@ -4132,17 +4164,17 @@ extern "C" __device__ double test_y0(double x) {
 
 // DEFAULT-LABEL: @test_y1f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_y1f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test_y1f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test_y1f(float x) {
@@ -4151,17 +4183,17 @@ extern "C" __device__ float test_y1f(float x) {
 
 // DEFAULT-LABEL: @test_y1(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret double [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test_y1(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret double [[CALL_I]]
 //
 // APPROX-LABEL: @test_y1(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test_y1(double x) {
@@ -4175,14 +4207,14 @@ extern "C" __device__ double test_y1(double x) {
 // DEFAULT-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT:    ]
 // DEFAULT:       if.then.i:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL3YNFIF_EXIT:%.*]]
 // DEFAULT:       if.then2.i:
-// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL3YNFIF_EXIT]]
 // DEFAULT:       if.end4.i:
-// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]]
 // DEFAULT:       for.body.i:
@@ -4208,14 +4240,14 @@ extern "C" __device__ double test_y1(double x) {
 // FINITEONLY-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT:    ]
 // FINITEONLY:       if.then.i:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL3YNFIF_EXIT:%.*]]
 // FINITEONLY:       if.then2.i:
-// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL3YNFIF_EXIT]]
 // FINITEONLY:       if.end4.i:
-// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y0_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_y1_f32(float noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]]
 // FINITEONLY:       for.body.i:
@@ -4241,14 +4273,14 @@ extern "C" __device__ double test_y1(double x) {
 // APPROX-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT:    ]
 // APPROX:       if.then.i:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL3YNFIF_EXIT:%.*]]
 // APPROX:       if.then2.i:
-// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL3YNFIF_EXIT]]
 // APPROX:       if.end4.i:
-// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]]
 // APPROX:       for.body.i:
@@ -4278,14 +4310,14 @@ extern "C" __device__ float test_ynf(int x, float y) {
 // DEFAULT-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // DEFAULT-NEXT:    ]
 // DEFAULT:       if.then.i:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL2YNID_EXIT:%.*]]
 // DEFAULT:       if.then2.i:
-// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    br label [[_ZL2YNID_EXIT]]
 // DEFAULT:       if.end4.i:
-// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]]
-// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // DEFAULT-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]]
 // DEFAULT:       for.body.i:
@@ -4311,14 +4343,14 @@ extern "C" __device__ float test_ynf(int x, float y) {
 // FINITEONLY-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // FINITEONLY-NEXT:    ]
 // FINITEONLY:       if.then.i:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL2YNID_EXIT:%.*]]
 // FINITEONLY:       if.then2.i:
-// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I20_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    br label [[_ZL2YNID_EXIT]]
 // FINITEONLY:       if.end4.i:
-// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
-// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I21_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y0_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I22_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_y1_f64(double noundef nofpclass(nan inf) [[Y]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // FINITEONLY-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]]
 // FINITEONLY:       for.body.i:
@@ -4344,14 +4376,14 @@ extern "C" __device__ float test_ynf(int x, float y) {
 // APPROX-NEXT:    i32 1, label [[IF_THEN2_I:%.*]]
 // APPROX-NEXT:    ]
 // APPROX:       if.then.i:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL2YNID_EXIT:%.*]]
 // APPROX:       if.then2.i:
-// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I20_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    br label [[_ZL2YNID_EXIT]]
 // APPROX:       if.end4.i:
-// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]]
-// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I21_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I22_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR16]]
 // APPROX-NEXT:    [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1
 // APPROX-NEXT:    br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]]
 // APPROX:       for.body.i:
@@ -4376,17 +4408,17 @@ extern "C" __device__ double test_yn(int x, double y) {
 
 // DEFAULT-LABEL: @test___cosf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test___cosf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test___cosf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___cosf(float x) {
@@ -4396,19 +4428,19 @@ extern "C" __device__ float test___cosf(float x) {
 // DEFAULT-LABEL: @test___exp10f(
 // DEFAULT-NEXT:  entry:
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___exp10f(
 // FINITEONLY-NEXT:  entry:
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = fmul nnan ninf contract float [[X:%.*]], 0x400A934F00000000
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test___exp10f(
 // APPROX-NEXT:  entry:
 // APPROX-NEXT:    [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test___exp10f(float x) {
@@ -4418,19 +4450,19 @@ extern "C" __device__ float test___exp10f(float x) {
 // DEFAULT-LABEL: @test___expf(
 // DEFAULT-NEXT:  entry:
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___expf(
 // FINITEONLY-NEXT:  entry:
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = fmul nnan ninf contract float [[X:%.*]], 0x3FF7154760000000
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test___expf(
 // APPROX-NEXT:  entry:
 // APPROX-NEXT:    [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test___expf(float x) {
@@ -4477,17 +4509,17 @@ extern "C" __device__ float test___fdividef(float x, float y) {
 
 // DEFAULT-LABEL: @test__fmaf_rn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test__fmaf_rn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test__fmaf_rn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test__fmaf_rn(float x, float y, float z) {
@@ -4534,17 +4566,17 @@ extern "C" __device__ float test___frcp_rn(float x) {
 
 // DEFAULT-LABEL: @test___frsqrt_rn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rsq.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___frsqrt_rn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.rsq.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test___frsqrt_rn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rsq.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test___frsqrt_rn(float x) {
@@ -4553,17 +4585,17 @@ extern "C" __device__ float test___frsqrt_rn(float x) {
 
 // DEFAULT-LABEL: @test___fsqrt_rn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test___fsqrt_rn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sqrt_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR14]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test___fsqrt_rn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR14]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsqrt_rn(float x) {
@@ -4591,17 +4623,17 @@ extern "C" __device__ float test___fsub_rn(float x, float y) {
 
 // DEFAULT-LABEL: @test___log10f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log10.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___log10f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.log10.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test___log10f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log10.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test___log10f(float x) {
@@ -4610,17 +4642,17 @@ extern "C" __device__ float test___log10f(float x) {
 
 // DEFAULT-LABEL: @test___log2f(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.log.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___log2f(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.log.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test___log2f(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.log.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test___log2f(float x) {
@@ -4629,17 +4661,17 @@ extern "C" __device__ float test___log2f(float x) {
 
 // DEFAULT-LABEL: @test___logf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log.f32(float [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___logf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.log.f32(float [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float [[X:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test___logf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.log.f32(float [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test___logf(float x) {
@@ -4648,17 +4680,17 @@ extern "C" __device__ float test___logf(float x) {
 
 // DEFAULT-LABEL: @test___powf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test___powf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_pow_f32(float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR15]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test___powf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR15]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___powf(float x, float y) {
@@ -4723,17 +4755,17 @@ extern "C" __device__ void test___sincosf(float x, float *y, float *z) {
 
 // DEFAULT-LABEL: @test___sinf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // DEFAULT-NEXT:    ret float [[CALL_I]]
 //
 // FINITEONLY-LABEL: @test___sinf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    ret float [[CALL_I]]
 //
 // APPROX-LABEL: @test___sinf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
 // APPROX-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___sinf(float x) {
@@ -4742,24 +4774,24 @@ extern "C" __device__ float test___sinf(float x) {
 
 // DEFAULT-LABEL: @test___tanf(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
-// DEFAULT-NEXT:    [[CALL_I3_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// DEFAULT-NEXT:    [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
 // DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I3_I]])
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = fmul contract float [[CALL_I_I]], [[TMP0]]
 // DEFAULT-NEXT:    ret float [[MUL_I]]
 //
 // FINITEONLY-LABEL: @test___tanf(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
-// FINITEONLY-NEXT:    [[CALL_I3_I:%.*]] = tail call nnan ninf contract nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR16]]
+// FINITEONLY-NEXT:    [[CALL_I3_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_native_cos_f32(float noundef nofpclass(nan inf) [[X]]) #[[ATTR16]]
 // FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.amdgcn.rcp.f32(float [[CALL_I3_I]])
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = fmul nnan ninf contract float [[CALL_I_I]], [[TMP0]]
 // FINITEONLY-NEXT:    ret float [[MUL_I]]
 //
 // APPROX-LABEL: @test___tanf(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
-// APPROX-NEXT:    [[CALL_I3_I:%.*]] = tail call contract float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR16]]
+// APPROX-NEXT:    [[CALL_I3_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR16]]
 // APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.amdgcn.rcp.f32(float [[CALL_I3_I]])
 // APPROX-NEXT:    [[MUL_I:%.*]] = fmul contract float [[CALL_I_I]], [[TMP0]]
 // APPROX-NEXT:    ret float [[MUL_I]]
@@ -4846,17 +4878,17 @@ extern "C" __device__ double test___drcp_rn(double x) {
 
 // DEFAULT-LABEL: @test___dsqrt_rn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.sqrt.f64(double [[X:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test___dsqrt_rn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.sqrt.f64(double [[X:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[X:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test___dsqrt_rn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.sqrt.f64(double [[X:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test___dsqrt_rn(double x) {
@@ -4865,17 +4897,17 @@ extern "C" __device__ double test___dsqrt_rn(double x) {
 
 // DEFAULT-LABEL: @test__fma_rn(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test__fma_rn(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test__fma_rn(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test__fma_rn(double x, double y, double z) {
@@ -4884,17 +4916,17 @@ extern "C" __device__ double test__fma_rn(double x, double y, double z) {
 
 // DEFAULT-LABEL: @test_float_min(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_float_min(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_float_min(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_float_min(float x, float y) {
@@ -4903,17 +4935,17 @@ extern "C" __device__ float test_float_min(float x, float y) {
 
 // DEFAULT-LABEL: @test_float_max(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // DEFAULT-NEXT:    ret float [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_float_max(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // FINITEONLY-NEXT:    ret float [[TMP0]]
 //
 // APPROX-LABEL: @test_float_max(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
 // APPROX-NEXT:    ret float [[TMP0]]
 //
 extern "C" __device__ float test_float_max(float x, float y) {
@@ -4922,17 +4954,17 @@ extern "C" __device__ float test_float_max(float x, float y) {
 
 // DEFAULT-LABEL: @test_double_min(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_double_min(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_double_min(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_double_min(double x, double y) {
@@ -4941,17 +4973,17 @@ extern "C" __device__ double test_double_min(double x, double y) {
 
 // DEFAULT-LABEL: @test_double_max(
 // DEFAULT-NEXT:  entry:
-// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// DEFAULT-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // DEFAULT-NEXT:    ret double [[TMP0]]
 //
 // FINITEONLY-LABEL: @test_double_max(
 // FINITEONLY-NEXT:  entry:
-// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// FINITEONLY-NEXT:    [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // FINITEONLY-NEXT:    ret double [[TMP0]]
 //
 // APPROX-LABEL: @test_double_max(
 // APPROX-NEXT:  entry:
-// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
+// APPROX-NEXT:    [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 // APPROX-NEXT:    ret double [[TMP0]]
 //
 extern "C" __device__ double test_double_max(double x, double y) {
@@ -4959,7 +4991,7 @@ extern "C" __device__ double test_double_max(double x, double y) {
 }
 // CHECK-LABEL: @test_int_min(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COND_I:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
+// CHECK-NEXT:    [[COND_I:%.*]] = tail call noundef i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
 // CHECK-NEXT:    ret i32 [[COND_I]]
 //
 extern "C" __device__ int test_int_min(int x, int y) {
@@ -4968,7 +5000,7 @@ extern "C" __device__ int test_int_min(int x, int y) {
 
 // CHECK-LABEL: @test_int_max(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COND_I:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
+// CHECK-NEXT:    [[COND_I:%.*]] = tail call noundef i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
 // CHECK-NEXT:    ret i32 [[COND_I]]
 //
 extern "C" __device__ int test_int_max(int x, int y) {
diff --git a/clang/test/Headers/__clang_hip_math_ocml_rounded_ops.hip b/clang/test/Headers/__clang_hip_math_ocml_rounded_ops.hip
index 422b5fb56c4d40..9e30c2c9c2ccbb 100644
--- a/clang/test/Headers/__clang_hip_math_ocml_rounded_ops.hip
+++ b/clang/test/Headers/__clang_hip_math_ocml_rounded_ops.hip
@@ -9,7 +9,7 @@
 
 // CHECK-LABEL: @test___fadd_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_add_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_add_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3:[0-9]+]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fadd_rd(float x, float y) {
@@ -18,7 +18,7 @@ extern "C" __device__ float test___fadd_rd(float x, float y) {
 
 // CHECK-LABEL: @test___fadd_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_add_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_add_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fadd_rn(float x, float y) {
@@ -27,7 +27,7 @@ extern "C" __device__ float test___fadd_rn(float x, float y) {
 
 // CHECK-LABEL: @test___fadd_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_add_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_add_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fadd_ru(float x, float y) {
@@ -36,7 +36,7 @@ extern "C" __device__ float test___fadd_ru(float x, float y) {
 
 // CHECK-LABEL: @test___fadd_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_add_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_add_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fadd_rz(float x, float y) {
@@ -45,7 +45,7 @@ extern "C" __device__ float test___fadd_rz(float x, float y) {
 
 // CHECK-LABEL: @test__fmaf_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fma_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fma_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test__fmaf_rd(float x, float y, float z) {
@@ -54,7 +54,7 @@ extern "C" __device__ float test__fmaf_rd(float x, float y, float z) {
 
 // CHECK-LABEL: @test__fmaf_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fma_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fma_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test__fmaf_rn(float x, float y, float z) {
@@ -63,7 +63,7 @@ extern "C" __device__ float test__fmaf_rn(float x, float y, float z) {
 
 // CHECK-LABEL: @test__fmaf_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fma_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fma_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test__fmaf_ru(float x, float y, float z) {
@@ -72,7 +72,7 @@ extern "C" __device__ float test__fmaf_ru(float x, float y, float z) {
 
 // CHECK-LABEL: @test__fmaf_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_fma_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fma_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test__fmaf_rz(float x, float y, float z) {
@@ -81,7 +81,7 @@ extern "C" __device__ float test__fmaf_rz(float x, float y, float z) {
 
 // CHECK-LABEL: @test___fmul_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_mul_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_mul_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fmul_rd(float x, float y) {
@@ -90,7 +90,7 @@ extern "C" __device__ float test___fmul_rd(float x, float y) {
 
 // CHECK-LABEL: @test___fmul_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_mul_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_mul_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fmul_rn(float x, float y) {
@@ -99,7 +99,7 @@ extern "C" __device__ float test___fmul_rn(float x, float y) {
 
 // CHECK-LABEL: @test___fmul_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_mul_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_mul_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fmul_ru(float x, float y) {
@@ -108,7 +108,7 @@ extern "C" __device__ float test___fmul_ru(float x, float y) {
 
 // CHECK-LABEL: @test___fmul_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_mul_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_mul_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fmul_rz(float x, float y) {
@@ -117,7 +117,7 @@ extern "C" __device__ float test___fmul_rz(float x, float y) {
 
 // CHECK-LABEL: @test___frcp_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_div_rtn_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_div_rtn_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___frcp_rd(float x) {
@@ -126,7 +126,7 @@ extern "C" __device__ float test___frcp_rd(float x) {
 
 // CHECK-LABEL: @test___frcp_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_div_rte_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_div_rte_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___frcp_rn(float x) {
@@ -135,7 +135,7 @@ extern "C" __device__ float test___frcp_rn(float x) {
 
 // CHECK-LABEL: @test___frcp_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_div_rtp_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_div_rtp_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___frcp_ru(float x) {
@@ -144,7 +144,7 @@ extern "C" __device__ float test___frcp_ru(float x) {
 
 // CHECK-LABEL: @test___frcp_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_div_rtz_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_div_rtz_f32(float noundef 1.000000e+00, float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___frcp_rz(float x) {
@@ -153,7 +153,7 @@ extern "C" __device__ float test___frcp_rz(float x) {
 
 // CHECK-LABEL: @test___fsqrt_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sqrt_rtn_f32(float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sqrt_rtn_f32(float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsqrt_rd(float x) {
@@ -162,7 +162,7 @@ extern "C" __device__ float test___fsqrt_rd(float x) {
 
 // CHECK-LABEL: @test___fsqrt_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sqrt_rte_f32(float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sqrt_rte_f32(float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsqrt_rn(float x) {
@@ -171,7 +171,7 @@ extern "C" __device__ float test___fsqrt_rn(float x) {
 
 // CHECK-LABEL: @test___fsqrt_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sqrt_rtp_f32(float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sqrt_rtp_f32(float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsqrt_ru(float x) {
@@ -180,7 +180,7 @@ extern "C" __device__ float test___fsqrt_ru(float x) {
 
 // CHECK-LABEL: @test___fsqrt_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sqrt_rtz_f32(float noundef [[X:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sqrt_rtz_f32(float noundef [[X:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsqrt_rz(float x) {
@@ -189,7 +189,7 @@ extern "C" __device__ float test___fsqrt_rz(float x) {
 
 // CHECK-LABEL: @test___fsub_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sub_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sub_rtn_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsub_rd(float x, float y) {
@@ -198,7 +198,7 @@ extern "C" __device__ float test___fsub_rd(float x, float y) {
 
 // CHECK-LABEL: @test___fsub_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sub_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sub_rte_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsub_rn(float x, float y) {
@@ -207,7 +207,7 @@ extern "C" __device__ float test___fsub_rn(float x, float y) {
 
 // CHECK-LABEL: @test___fsub_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sub_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sub_rtp_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsub_ru(float x, float y) {
@@ -216,7 +216,7 @@ extern "C" __device__ float test___fsub_ru(float x, float y) {
 
 // CHECK-LABEL: @test___fsub_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract float @__ocml_sub_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sub_rtz_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret float [[CALL_I]]
 //
 extern "C" __device__ float test___fsub_rz(float x, float y) {
@@ -225,7 +225,7 @@ extern "C" __device__ float test___fsub_rz(float x, float y) {
 
 // CHECK-LABEL: @test___dadd_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_add_rtn_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_add_rtn_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dadd_rd(double x, double y) {
@@ -234,7 +234,7 @@ extern "C" __device__ double test___dadd_rd(double x, double y) {
 
 // CHECK-LABEL: @test___dadd_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_add_rte_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_add_rte_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dadd_rn(double x, double y) {
@@ -243,7 +243,7 @@ extern "C" __device__ double test___dadd_rn(double x, double y) {
 
 // CHECK-LABEL: @test___dadd_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_add_rtp_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_add_rtp_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dadd_ru(double x, double y) {
@@ -252,7 +252,7 @@ extern "C" __device__ double test___dadd_ru(double x, double y) {
 
 // CHECK-LABEL: @test___dadd_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_add_rtz_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_add_rtz_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dadd_rz(double x, double y) {
@@ -261,7 +261,7 @@ extern "C" __device__ double test___dadd_rz(double x, double y) {
 
 // CHECK-LABEL: @test___dmul_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_mul_rtn_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_mul_rtn_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dmul_rd(double x, double y) {
@@ -270,7 +270,7 @@ extern "C" __device__ double test___dmul_rd(double x, double y) {
 
 // CHECK-LABEL: @test___dmul_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_mul_rte_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_mul_rte_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dmul_rn(double x, double y) {
@@ -279,7 +279,7 @@ extern "C" __device__ double test___dmul_rn(double x, double y) {
 
 // CHECK-LABEL: @test___dmul_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_mul_rtp_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_mul_rtp_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dmul_ru(double x, double y) {
@@ -288,7 +288,7 @@ extern "C" __device__ double test___dmul_ru(double x, double y) {
 
 // CHECK-LABEL: @test___dmul_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_mul_rtz_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_mul_rtz_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test___dmul_rz(double x, double y) {
@@ -298,7 +298,7 @@ extern "C" __device__ double test___dmul_rz(double x, double y) {
 // CHECK-LABEL: @test___drcp_rd(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_div_rtn_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_div_rtn_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -309,7 +309,7 @@ extern "C" __device__ float test___drcp_rd(float x) {
 // CHECK-LABEL: @test___drcp_rn(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_div_rte_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_div_rte_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -320,7 +320,7 @@ extern "C" __device__ float test___drcp_rn(float x) {
 // CHECK-LABEL: @test___drcp_ru(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_div_rtp_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_div_rtp_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -331,7 +331,7 @@ extern "C" __device__ float test___drcp_ru(float x) {
 // CHECK-LABEL: @test___drcp_rz(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_div_rtz_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_div_rtz_f64(double noundef 1.000000e+00, double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -342,7 +342,7 @@ extern "C" __device__ float test___drcp_rz(float x) {
 // CHECK-LABEL: @test___dsqrt_rd(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sqrt_rtn_f64(double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sqrt_rtn_f64(double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -353,7 +353,7 @@ extern "C" __device__ float test___dsqrt_rd(float x) {
 // CHECK-LABEL: @test___dsqrt_rn(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sqrt_rte_f64(double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sqrt_rte_f64(double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -364,7 +364,7 @@ extern "C" __device__ float test___dsqrt_rn(float x) {
 // CHECK-LABEL: @test___dsqrt_ru(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sqrt_rtp_f64(double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sqrt_rtp_f64(double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -375,7 +375,7 @@ extern "C" __device__ float test___dsqrt_ru(float x) {
 // CHECK-LABEL: @test___dsqrt_rz(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext float [[X:%.*]] to double
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_sqrt_rtz_f64(double noundef [[CONV]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sqrt_rtz_f64(double noundef [[CONV]]) #[[ATTR3]]
 // CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL_I]] to float
 // CHECK-NEXT:    ret float [[CONV1]]
 //
@@ -385,7 +385,7 @@ extern "C" __device__ float test___dsqrt_rz(float x) {
 
 // CHECK-LABEL: @test__fma_rd(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fma_rtn_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fma_rtn_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test__fma_rd(double x, double y, double z) {
@@ -394,7 +394,7 @@ extern "C" __device__ double test__fma_rd(double x, double y, double z) {
 
 // CHECK-LABEL: @test__fma_rn(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fma_rte_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fma_rte_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test__fma_rn(double x, double y, double z) {
@@ -403,7 +403,7 @@ extern "C" __device__ double test__fma_rn(double x, double y, double z) {
 
 // CHECK-LABEL: @test__fma_ru(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fma_rtp_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fma_rtp_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test__fma_ru(double x, double y, double z) {
@@ -412,7 +412,7 @@ extern "C" __device__ double test__fma_ru(double x, double y, double z) {
 
 // CHECK-LABEL: @test__fma_rz(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract double @__ocml_fma_rtz_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fma_rtz_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR3]]
 // CHECK-NEXT:    ret double [[CALL_I]]
 //
 extern "C" __device__ double test__fma_rz(double x, double y, double z) {
diff --git a/clang/test/Headers/amdgcn_openmp_device_math.c b/clang/test/Headers/amdgcn_openmp_device_math.c
index 296f89377e2ace..243d241628b2a0 100644
--- a/clang/test/Headers/amdgcn_openmp_device_math.c
+++ b/clang/test/Headers/amdgcn_openmp_device_math.c
@@ -113,21 +113,21 @@
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
 // CHECK-CPP-NEXT:    store double [[TMP0]], ptr [[__X_ADDR_ASCAST_I]], align 8
 // CHECK-CPP-NEXT:    [[TMP1:%.*]] = load double, ptr [[__X_ADDR_ASCAST_I]], align 8
-// CHECK-CPP-NEXT:    [[CALL_I:%.*]] = call double @__ocml_sin_f64(double noundef [[TMP1]]) #[[ATTR3:[0-9]+]]
+// CHECK-CPP-NEXT:    [[CALL_I:%.*]] = call noundef double @__ocml_sin_f64(double noundef [[TMP1]]) #[[ATTR3:[0-9]+]]
 // CHECK-CPP-NEXT:    store double [[CALL_I]], ptr [[L1_ASCAST]], align 8
 // CHECK-CPP-NEXT:    [[TMP2:%.*]] = load double, ptr [[X_ADDR_ASCAST]], align 8
 // CHECK-CPP-NEXT:    [[RETVAL_ASCAST_I6:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4]] to ptr
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I7:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I5]] to ptr
 // CHECK-CPP-NEXT:    store double [[TMP2]], ptr [[__X_ADDR_ASCAST_I7]], align 8
 // CHECK-CPP-NEXT:    [[TMP3:%.*]] = load double, ptr [[__X_ADDR_ASCAST_I7]], align 8
-// CHECK-CPP-NEXT:    [[CALL_I8:%.*]] = call double @__ocml_cos_f64(double noundef [[TMP3]]) #[[ATTR3]]
+// CHECK-CPP-NEXT:    [[CALL_I8:%.*]] = call noundef double @__ocml_cos_f64(double noundef [[TMP3]]) #[[ATTR3]]
 // CHECK-CPP-NEXT:    store double [[CALL_I8]], ptr [[L2_ASCAST]], align 8
 // CHECK-CPP-NEXT:    [[TMP4:%.*]] = load double, ptr [[X_ADDR_ASCAST]], align 8
 // CHECK-CPP-NEXT:    [[RETVAL_ASCAST_I11:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I9]] to ptr
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I12:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I10]] to ptr
 // CHECK-CPP-NEXT:    store double [[TMP4]], ptr [[__X_ADDR_ASCAST_I12]], align 8
 // CHECK-CPP-NEXT:    [[TMP5:%.*]] = load double, ptr [[__X_ADDR_ASCAST_I12]], align 8
-// CHECK-CPP-NEXT:    [[TMP6:%.*]] = call double @llvm.fabs.f64(double [[TMP5]])
+// CHECK-CPP-NEXT:    [[TMP6:%.*]] = call noundef double @llvm.fabs.f64(double [[TMP5]])
 // CHECK-CPP-NEXT:    store double [[TMP6]], ptr [[L3_ASCAST]], align 8
 // CHECK-CPP-NEXT:    [[TMP7:%.*]] = load double, ptr [[X_ADDR_ASCAST]], align 8
 // CHECK-CPP-NEXT:    [[TMP8:%.*]] = load double, ptr [[Y_ADDR_ASCAST]], align 8
@@ -142,7 +142,7 @@
 // CHECK-CPP-NEXT:    [[TMP10:%.*]] = load double, ptr [[__X_ADDR_ASCAST_I16]], align 8
 // CHECK-CPP-NEXT:    [[TMP11:%.*]] = load double, ptr [[__Y_ADDR_ASCAST_I]], align 8
 // CHECK-CPP-NEXT:    [[TMP12:%.*]] = load double, ptr [[__Z_ADDR_ASCAST_I]], align 8
-// CHECK-CPP-NEXT:    [[TMP13:%.*]] = call double @llvm.fma.f64(double [[TMP10]], double [[TMP11]], double [[TMP12]])
+// CHECK-CPP-NEXT:    [[TMP13:%.*]] = call noundef double @llvm.fma.f64(double [[TMP10]], double [[TMP11]], double [[TMP12]])
 // CHECK-CPP-NEXT:    store double [[TMP13]], ptr [[L4_ASCAST]], align 8
 // CHECK-CPP-NEXT:    ret void
 //
@@ -278,7 +278,7 @@ void test_math_f64(double x, double y, double z) {
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I22:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I20]] to ptr
 // CHECK-CPP-NEXT:    store float [[TMP1]], ptr [[__X_ADDR_ASCAST_I22]], align 4
 // CHECK-CPP-NEXT:    [[TMP2:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I22]], align 4
-// CHECK-CPP-NEXT:    [[CALL_I23:%.*]] = call float @__ocml_sin_f32(float noundef [[TMP2]]) #[[ATTR3]]
+// CHECK-CPP-NEXT:    [[CALL_I23:%.*]] = call noundef float @__ocml_sin_f32(float noundef [[TMP2]]) #[[ATTR3]]
 // CHECK-CPP-NEXT:    store float [[CALL_I23]], ptr [[L1_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP3:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[RETVAL_ASCAST_I6:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4]] to ptr
@@ -289,7 +289,7 @@ void test_math_f64(double x, double y, double z) {
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I27:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I25]] to ptr
 // CHECK-CPP-NEXT:    store float [[TMP4]], ptr [[__X_ADDR_ASCAST_I27]], align 4
 // CHECK-CPP-NEXT:    [[TMP5:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I27]], align 4
-// CHECK-CPP-NEXT:    [[CALL_I:%.*]] = call float @__ocml_cos_f32(float noundef [[TMP5]]) #[[ATTR3]]
+// CHECK-CPP-NEXT:    [[CALL_I:%.*]] = call noundef float @__ocml_cos_f32(float noundef [[TMP5]]) #[[ATTR3]]
 // CHECK-CPP-NEXT:    store float [[CALL_I]], ptr [[L2_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP6:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[RETVAL_ASCAST_I11:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I9]] to ptr
@@ -300,7 +300,7 @@ void test_math_f64(double x, double y, double z) {
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I31:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I29]] to ptr
 // CHECK-CPP-NEXT:    store float [[TMP7]], ptr [[__X_ADDR_ASCAST_I31]], align 4
 // CHECK-CPP-NEXT:    [[TMP8:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I31]], align 4
-// CHECK-CPP-NEXT:    [[TMP9:%.*]] = call float @llvm.fabs.f32(float [[TMP8]])
+// CHECK-CPP-NEXT:    [[TMP9:%.*]] = call noundef float @llvm.fabs.f32(float [[TMP8]])
 // CHECK-CPP-NEXT:    store float [[TMP9]], ptr [[L3_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP10:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP11:%.*]] = load float, ptr [[Y_ADDR_ASCAST]], align 4
@@ -325,7 +325,7 @@ void test_math_f64(double x, double y, double z) {
 // CHECK-CPP-NEXT:    [[TMP16:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I37]], align 4
 // CHECK-CPP-NEXT:    [[TMP17:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I38]], align 4
 // CHECK-CPP-NEXT:    [[TMP18:%.*]] = load float, ptr [[__Z_ADDR_ASCAST_I39]], align 4
-// CHECK-CPP-NEXT:    [[TMP19:%.*]] = call float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]])
+// CHECK-CPP-NEXT:    [[TMP19:%.*]] = call noundef float @llvm.fma.f32(float [[TMP16]], float [[TMP17]], float [[TMP18]])
 // CHECK-CPP-NEXT:    store float [[TMP19]], ptr [[L4_ASCAST]], align 4
 // CHECK-CPP-NEXT:    ret void
 //
@@ -437,21 +437,21 @@ void test_math_f32(float x, float y, float z) {
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
 // CHECK-CPP-NEXT:    store float [[TMP0]], ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-CPP-NEXT:    [[TMP1:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
-// CHECK-CPP-NEXT:    [[CALL_I:%.*]] = call float @__ocml_sin_f32(float noundef [[TMP1]]) #[[ATTR3]]
+// CHECK-CPP-NEXT:    [[CALL_I:%.*]] = call noundef float @__ocml_sin_f32(float noundef [[TMP1]]) #[[ATTR3]]
 // CHECK-CPP-NEXT:    store float [[CALL_I]], ptr [[L1_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP2:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[RETVAL_ASCAST_I6:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I4]] to ptr
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I7:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I5]] to ptr
 // CHECK-CPP-NEXT:    store float [[TMP2]], ptr [[__X_ADDR_ASCAST_I7]], align 4
 // CHECK-CPP-NEXT:    [[TMP3:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I7]], align 4
-// CHECK-CPP-NEXT:    [[CALL_I8:%.*]] = call float @__ocml_cos_f32(float noundef [[TMP3]]) #[[ATTR3]]
+// CHECK-CPP-NEXT:    [[CALL_I8:%.*]] = call noundef float @__ocml_cos_f32(float noundef [[TMP3]]) #[[ATTR3]]
 // CHECK-CPP-NEXT:    store float [[CALL_I8]], ptr [[L2_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP4:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[RETVAL_ASCAST_I11:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I9]] to ptr
 // CHECK-CPP-NEXT:    [[__X_ADDR_ASCAST_I12:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I10]] to ptr
 // CHECK-CPP-NEXT:    store float [[TMP4]], ptr [[__X_ADDR_ASCAST_I12]], align 4
 // CHECK-CPP-NEXT:    [[TMP5:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I12]], align 4
-// CHECK-CPP-NEXT:    [[TMP6:%.*]] = call float @llvm.fabs.f32(float [[TMP5]])
+// CHECK-CPP-NEXT:    [[TMP6:%.*]] = call noundef float @llvm.fabs.f32(float [[TMP5]])
 // CHECK-CPP-NEXT:    store float [[TMP6]], ptr [[L3_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP7:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
 // CHECK-CPP-NEXT:    [[TMP8:%.*]] = load float, ptr [[Y_ADDR_ASCAST]], align 4
@@ -466,7 +466,7 @@ void test_math_f32(float x, float y, float z) {
 // CHECK-CPP-NEXT:    [[TMP10:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I16]], align 4
 // CHECK-CPP-NEXT:    [[TMP11:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I]], align 4
 // CHECK-CPP-NEXT:    [[TMP12:%.*]] = load float, ptr [[__Z_ADDR_ASCAST_I]], align 4
-// CHECK-CPP-NEXT:    [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
+// CHECK-CPP-NEXT:    [[TMP13:%.*]] = call noundef float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
 // CHECK-CPP-NEXT:    store float [[TMP13]], ptr [[L4_ASCAST]], align 4
 // CHECK-CPP-NEXT:    ret void
 //
diff --git a/clang/test/Headers/amdgcn_openmp_device_math_constexpr.cpp b/clang/test/Headers/amdgcn_openmp_device_math_constexpr.cpp
index ec34b237edd383..a5bb949ccaad3a 100644
--- a/clang/test/Headers/amdgcn_openmp_device_math_constexpr.cpp
+++ b/clang/test/Headers/amdgcn_openmp_device_math_constexpr.cpp
@@ -44,7 +44,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
 // CHECK-NEXT:    store float -2.000000e+00, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call noundef float @llvm.fabs.f32(float [[TMP0]])
 // CHECK-NEXT:    store float [[TMP1]], ptr addrspacecast (ptr addrspace(1) @_ZL19constexpr_fabsf_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -64,7 +64,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[__X_ADDR_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I_I]] to ptr
 // CHECK-NEXT:    store float [[TMP0]], ptr [[__X_ADDR_ASCAST_I_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call noundef float @llvm.fabs.f32(float [[TMP1]])
 // CHECK-NEXT:    store float [[TMP2]], ptr addrspacecast (ptr addrspace(1) @_ZL18constexpr_fabs_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -78,7 +78,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
 // CHECK-NEXT:    store float -2.000000e+00, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[CALL_I:%.*]] = call float @__ocml_sin_f32(float noundef [[TMP0]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = call noundef float @__ocml_sin_f32(float noundef [[TMP0]]) #[[ATTR3:[0-9]+]]
 // CHECK-NEXT:    store float [[CALL_I]], ptr addrspacecast (ptr addrspace(1) @_ZL18constexpr_sinf_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -98,7 +98,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[__X_ADDR_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I_I]] to ptr
 // CHECK-NEXT:    store float [[TMP0]], ptr [[__X_ADDR_ASCAST_I_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I_I]], align 4
-// CHECK-NEXT:    [[CALL_I_I:%.*]] = call float @__ocml_sin_f32(float noundef [[TMP1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I_I:%.*]] = call noundef float @__ocml_sin_f32(float noundef [[TMP1]]) #[[ATTR3]]
 // CHECK-NEXT:    store float [[CALL_I_I]], ptr addrspacecast (ptr addrspace(1) @_ZL17constexpr_sin_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -112,7 +112,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[__X_ADDR_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I]] to ptr
 // CHECK-NEXT:    store float -2.000000e+00, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[CALL_I:%.*]] = call float @__ocml_cos_f32(float noundef [[TMP0]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I:%.*]] = call noundef float @__ocml_cos_f32(float noundef [[TMP0]]) #[[ATTR3]]
 // CHECK-NEXT:    store float [[CALL_I]], ptr addrspacecast (ptr addrspace(1) @_ZL18constexpr_cosf_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -132,7 +132,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[__X_ADDR_ASCAST_I_I:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR_I_I]] to ptr
 // CHECK-NEXT:    store float [[TMP0]], ptr [[__X_ADDR_ASCAST_I_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I_I]], align 4
-// CHECK-NEXT:    [[CALL_I_I:%.*]] = call float @__ocml_cos_f32(float noundef [[TMP1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL_I_I:%.*]] = call noundef float @__ocml_cos_f32(float noundef [[TMP1]]) #[[ATTR3]]
 // CHECK-NEXT:    store float [[CALL_I_I]], ptr addrspacecast (ptr addrspace(1) @_ZL17constexpr_cos_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -154,7 +154,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[__Z_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.fma.f32(float [[TMP0]], float [[TMP1]], float [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call noundef float @llvm.fma.f32(float [[TMP0]], float [[TMP1]], float [[TMP2]])
 // CHECK-NEXT:    store float [[TMP3]], ptr addrspacecast (ptr addrspace(1) @_ZL18constexpr_fmaf_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -190,7 +190,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I_I]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I_I]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[__Z_ADDR_ASCAST_I_I]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], float [[TMP4]], float [[TMP5]])
+// CHECK-NEXT:    [[TMP6:%.*]] = call noundef float @llvm.fma.f32(float [[TMP3]], float [[TMP4]], float [[TMP5]])
 // CHECK-NEXT:    store float [[TMP6]], ptr addrspacecast (ptr addrspace(1) @_ZL17constexpr_fma_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -208,7 +208,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    store float -4.000000e+00, ptr [[__Y_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[TMP0]], float [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call noundef float @llvm.minnum.f32(float [[TMP0]], float [[TMP1]])
 // CHECK-NEXT:    store float [[TMP2]], ptr addrspacecast (ptr addrspace(1) @_ZL17constexpr_min_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -226,7 +226,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    store float -4.000000e+00, ptr [[__Y_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[TMP0]], float [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call noundef float @llvm.maxnum.f32(float [[TMP0]], float [[TMP1]])
 // CHECK-NEXT:    store float [[TMP2]], ptr addrspacecast (ptr addrspace(1) @_ZL17constexpr_max_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -260,7 +260,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    store float -4.000000e+00, ptr [[__Y_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[TMP0]], float [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call noundef float @llvm.minnum.f32(float [[TMP0]], float [[TMP1]])
 // CHECK-NEXT:    store float [[TMP2]], ptr addrspacecast (ptr addrspace(1) @_ZL19constexpr_fminf_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
@@ -278,7 +278,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
 // CHECK-NEXT:    store float -4.000000e+00, ptr [[__Y_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__X_ADDR_ASCAST_I]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__Y_ADDR_ASCAST_I]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[TMP0]], float [[TMP1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call noundef float @llvm.maxnum.f32(float [[TMP0]], float [[TMP1]])
 // CHECK-NEXT:    store float [[TMP2]], ptr addrspacecast (ptr addrspace(1) @_ZL19constexpr_fmaxf_f32 to ptr), align 4
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/Headers/hip-header.hip b/clang/test/Headers/hip-header.hip
index e8654c867fcb1f..146a43b643dba1 100644
--- a/clang/test/Headers/hip-header.hip
+++ b/clang/test/Headers/hip-header.hip
@@ -111,7 +111,7 @@ __device__ void test_numeric_type() {
 // instead of fma(_Float16, _Float16, _Float16).
 
 // CXX14-LABEL: define{{.*}}@_Z8test_fma
-// CXX14: call contract half @llvm.fma.f16
+// CXX14: call contract noundef half @llvm.fma.f16
 __device__ double test_fma(_Float16 h, int i) {
   return fma(h, h, i);
 }
@@ -149,12 +149,12 @@ __device__ double test_isnan() {
   double d = 5.0;
   float f = 5.0;
 
-  // AMD_INT_RETURN: call i1 @llvm.is.fpclass.f32(float {{.*}}, i32 3)
-  // AMD_BOOL_RETURN: call i1 @llvm.is.fpclass.f32(float {{.*}}, i32 3)
+  // AMD_INT_RETURN: call noundef i1 @llvm.is.fpclass.f32(float {{.*}}, i32 3)
+  // AMD_BOOL_RETURN: call noundef i1 @llvm.is.fpclass.f32(float {{.*}}, i32 3)
   r += isnan(f);
 
-  // AMD_INT_RETURN: call i1 @llvm.is.fpclass.f64(double {{.*}}, i32 3)
-  // AMD_BOOL_RETURN: call i1 @llvm.is.fpclass.f64(double {{.*}}, i32 3)
+  // AMD_INT_RETURN: call noundef i1 @llvm.is.fpclass.f64(double {{.*}}, i32 3)
+  // AMD_BOOL_RETURN: call noundef i1 @llvm.is.fpclass.f64(double {{.*}}, i32 3)
   r += isnan(d);
 
   return r ;
diff --git a/clang/test/Headers/nvptx_device_cmath_functions.cpp b/clang/test/Headers/nvptx_device_cmath_functions.cpp
index 9e73172fa28e03..898a8544a6e6a8 100644
--- a/clang/test/Headers/nvptx_device_cmath_functions.cpp
+++ b/clang/test/Headers/nvptx_device_cmath_functions.cpp
@@ -12,15 +12,15 @@
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK-YES: call double @__nv_sqrt(double
+    // CHECK-YES: call noundef double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK-YES: call double @__nv_pow(double
+    // CHECK-YES: call noundef double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK-YES: call double @__nv_modf(double
+    // CHECK-YES: call noundef double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK-YES: call double @__nv_fabs(double
+    // CHECK-YES: call noundef double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK-YES: call i32 @__nv_abs(i32
+    // CHECK-YES: call noundef i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp b/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
index 8c3cd8779015f7..5a68854a43ea77 100644
--- a/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
+++ b/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
@@ -12,15 +12,15 @@
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK-YES: call double @__nv_sqrt(double
+    // CHECK-YES: call noundef double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK-YES: call double @__nv_pow(double
+    // CHECK-YES: call noundef double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK-YES: call double @__nv_modf(double
+    // CHECK-YES: call noundef double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK-YES: call double @__nv_fabs(double
+    // CHECK-YES: call noundef double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK-YES: call i32 @__nv_abs(i32
+    // CHECK-YES: call noundef i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_math_functions.c b/clang/test/Headers/nvptx_device_math_functions.c
index 3cbeeafe4b3a29..47781f2206dea7 100644
--- a/clang/test/Headers/nvptx_device_math_functions.c
+++ b/clang/test/Headers/nvptx_device_math_functions.c
@@ -4,9 +4,9 @@
 // REQUIRES: nvptx-registered-target
 
 // RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -x c -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-C
 // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -x c++ -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-CXX
 
 #ifdef __cplusplus
 #include <cstdlib>
@@ -19,15 +19,20 @@
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK: call double @__nv_sqrt(double
+    // CHECK-C: call double @__nv_sqrt(double
+    // CHECK-CXX: call noundef double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK: call double @__nv_pow(double
+    // CHECK-C: call double @__nv_pow(double
+    // CHECK-CXX: call noundef double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK: call double @__nv_modf(double
+    // CHECK-C: call double @__nv_modf(double
+    // CHECK-CXX: call noundef double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK: call double @__nv_fabs(double
+    // CHECK-C: call double @__nv_fabs(double
+    // CHECK-CXX: call noundef double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK: call i32 @__nv_abs(i32
+    // CHECK-C: call i32 @__nv_abs(i32
+    // CHECK-CXX: call noundef i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_math_functions.cpp b/clang/test/Headers/nvptx_device_math_functions.cpp
index 1bb20c3a24f8dd..791cb49f918c7a 100644
--- a/clang/test/Headers/nvptx_device_math_functions.cpp
+++ b/clang/test/Headers/nvptx_device_math_functions.cpp
@@ -12,15 +12,15 @@
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK-YES: call double @__nv_sqrt(double
+    // CHECK-YES: call noundef double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK-YES: call double @__nv_pow(double
+    // CHECK-YES: call noundef double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK-YES: call double @__nv_modf(double
+    // CHECK-YES: call noundef double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK-YES: call double @__nv_fabs(double
+    // CHECK-YES: call noundef double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK-YES: call i32 @__nv_abs(i32
+    // CHECK-YES: call noundef i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp b/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
index e6fd867c955fae..aa759ec6a54c88 100644
--- a/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
+++ b/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
@@ -12,15 +12,15 @@
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK-YES: call double @__nv_sqrt(double
+    // CHECK-YES: call noundef double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK-YES: call double @__nv_pow(double
+    // CHECK-YES: call noundef double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK-YES: call double @__nv_modf(double
+    // CHECK-YES: call noundef double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK-YES: call double @__nv_fabs(double
+    // CHECK-YES: call noundef double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK-YES: call i32 @__nv_abs(i32
+    // CHECK-YES: call noundef i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_math_modf.cpp b/clang/test/Headers/nvptx_device_math_modf.cpp
index eb9f57538a625d..fdd87c5915e5b2 100644
--- a/clang/test/Headers/nvptx_device_math_modf.cpp
+++ b/clang/test/Headers/nvptx_device_math_modf.cpp
@@ -7,13 +7,13 @@
 // 4 calls to modf(f), all translated to __nv_modf calls:
 
 // CHECK-NOT: _Z.modf
-// CHECK: call double @__nv_modf(double
+// CHECK: call noundef double @__nv_modf(double
 // CHECK-NOT: _Z.modf
-// CHECK: call float @__nv_modff(float
+// CHECK: call noundef float @__nv_modff(float
 // CHECK-NOT: _Z.modf
-// CHECK: call double @__nv_modf(double
+// CHECK: call noundef double @__nv_modf(double
 // CHECK-NOT: _Z.modf
-// CHECK: call float @__nv_modff(float
+// CHECK: call noundef float @__nv_modff(float
 // CHECK-NOT: _Z.modf
 
 template<typename T>
diff --git a/clang/test/Headers/nvptx_device_math_sin.cpp b/clang/test/Headers/nvptx_device_math_sin.cpp
index 3f4fe5bff6b2e0..5db132f02c9ae7 100644
--- a/clang/test/Headers/nvptx_device_math_sin.cpp
+++ b/clang/test/Headers/nvptx_device_math_sin.cpp
@@ -9,11 +9,11 @@
 
 double math(float f, double d) {
   double r = 0;
-// SLOW:  call float @__nv_sinf(float
-// FAST:  call fast nofpclass(nan inf) float @__nv_fast_sinf(float
+// SLOW:  call noundef float @__nv_sinf(float
+// FAST:  call fast noundef nofpclass(nan inf) float @__nv_fast_sinf(float
   r += sin(f);
-// SLOW:  call double @__nv_sin(double
-// FAST:  call fast nofpclass(nan inf) double @__nv_sin(double
+// SLOW:  call noundef double @__nv_sin(double
+// FAST:  call fast noundef nofpclass(nan inf) double @__nv_sin(double
   r += sin(d);
   return r;
 }
diff --git a/clang/test/Headers/nvptx_device_math_sin_cos.cpp b/clang/test/Headers/nvptx_device_math_sin_cos.cpp
index 98e7b3feb6b999..9be082802303b9 100644
--- a/clang/test/Headers/nvptx_device_math_sin_cos.cpp
+++ b/clang/test/Headers/nvptx_device_math_sin_cos.cpp
@@ -8,22 +8,22 @@
 
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
-// CHECK: call double @__nv_sin(double
+// CHECK: call noundef double @__nv_sin(double
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
-// CHECK: call float @__nv_sinf(float
+// CHECK: call noundef float @__nv_sinf(float
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
-// CHECK: call double @__nv_sin(double
+// CHECK: call noundef double @__nv_sin(double
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
-// CHECK: call double @__nv_cos(double
+// CHECK: call noundef double @__nv_cos(double
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
-// CHECK: call float @__nv_sinf(float
+// CHECK: call noundef float @__nv_sinf(float
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
-// CHECK: call float @__nv_cosf(float
+// CHECK: call noundef float @__nv_cosf(float
 // CHECK-NOT: _Z.sin
 // CHECK-NOT: _Z.cos
 
diff --git a/clang/test/Headers/openmp_device_math_isnan.cpp b/clang/test/Headers/openmp_device_math_isnan.cpp
index 0a632abcd8b06d..a297cfc5b9293d 100644
--- a/clang/test/Headers/openmp_device_math_isnan.cpp
+++ b/clang/test/Headers/openmp_device_math_isnan.cpp
@@ -20,17 +20,17 @@
 
 double math(float f, double d) {
   double r = 0;
-  // INT_RETURN: call i32 @__nv_isnanf(float
+  // INT_RETURN: call noundef i32 @__nv_isnanf(float
   // AMD_INT_RETURN_SAFE: call i1 @llvm.is.fpclass.f32(float{{.*}}, i32 3)
   // AMD_INT_RETURN_FAST: sitofp i32 {{.*}} to double
-  // BOOL_RETURN: call i32 @__nv_isnanf(float
+  // BOOL_RETURN: call noundef i32 @__nv_isnanf(float
   // AMD_BOOL_RETURN_SAFE: call i1 @llvm.is.fpclass.f32(float{{.*}}, i32 3)
   // AMD_BOOL_RETURN_FAST: icmp ne i32 {{.*}}, 0
   r += std::isnan(f);
-  // INT_RETURN: call i32 @__nv_isnand(double
+  // INT_RETURN: call noundef i32 @__nv_isnand(double
   // AMD_INT_RETURN_SAFE: call i1 @llvm.is.fpclass.f64(double{{.*}}, i32 3)
   // AMD_INT_RETURN_FAST: sitofp i32 {{.*}} to double
-  // BOOL_RETURN: call i32 @__nv_isnand(double
+  // BOOL_RETURN: call noundef i32 @__nv_isnand(double
   // AMD_BOOL_RETURN_SAFE: call i1 @llvm.is.fpclass.f64(double{{.*}}, i32 3)
   // AMD_BOOL_RETURN_FAST: icmp ne i32 {{.*}}, 0
   r += std::isnan(d);
diff --git a/clang/test/Headers/stdarg.c b/clang/test/Headers/stdarg.c
index 49df42caa3300a..1c403d109bf7c3 100644
--- a/clang/test/Headers/stdarg.c
+++ b/clang/test/Headers/stdarg.c
@@ -1,29 +1,47 @@
+// RUN: rm -fR %t
 // RUN: split-file %s %t
 // RUN: %clang_cc1 -fsyntax-only -verify=c89 -Werror=implicit-function-declaration -std=c89 %t/stdarg0.c
 // RUN: %clang_cc1 -fsyntax-only -verify=c99 -Werror=implicit-function-declaration -std=c99 %t/stdarg0.c
+// RUN: %clang_cc1 -fsyntax-only -verify=c89-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdarg0.c
+// RUN: %clang_cc1 -fsyntax-only -verify=c99-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c99 %t/stdarg0.c
 // RUN: %clang_cc1 -fsyntax-only -verify=c89 -Werror=implicit-function-declaration -std=c89 %t/stdarg1.c
 // RUN: %clang_cc1 -fsyntax-only -verify=c99 -Werror=implicit-function-declaration -std=c99 %t/stdarg1.c
+// RUN: %clang_cc1 -fsyntax-only -verify=c89-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdarg1.c
+// RUN: %clang_cc1 -fsyntax-only -verify=c99-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c99 %t/stdarg1.c
 
 // Split the file so that the "implicitly declaring library function" errors get repeated.
 
 //--- stdarg0.c
 static void f(int p, ...) {
-    __gnuc_va_list g; // c89-error{{undeclared identifier '__gnuc_va_list'}} c99-error{{undeclared identifier}}
-    va_list v; // c89-error{{undeclared identifier 'va_list'}} c99-error{{undeclared identifier}}
+    __gnuc_va_list g; // c89-error{{undeclared identifier '__gnuc_va_list'}} c99-error{{undeclared identifier}} \
+                         c89-modules-error{{undeclared identifier}} c99-modules-error{{undeclared identifier}}
+    va_list v; // c89-error{{undeclared identifier 'va_list'}} c99-error{{undeclared identifier}} \
+                  c89-modules-error{{undeclared identifier}} c99-modules-error{{undeclared identifier}}
     va_start(v, p); // c89-error{{implicitly declaring library function 'va_start'}} c89-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_start'}} c89-error{{undeclared identifier 'v'}} \
-                       c99-error{{call to undeclared library function 'va_start'}} c99-note{{provide a declaration}} c99-error{{undeclared identifier}}
+                       c99-error{{call to undeclared library function 'va_start'}} c99-note{{provide a declaration}} c99-error{{undeclared identifier}} \
+                       c89-modules-error{{implicitly declaring library function}} c89-modules-note{{provide a declaration}} c89-modules-error{{undeclared identifier}} \
+                       c99-modules-error{{undeclared library function}} c99-modules-note{{provide a declaration}} c99-modules-error{{undeclared identifier}}
     int i = va_arg(v, int); // c89-error{{implicit declaration of function 'va_arg'}} c89-error{{expected expression}} c89-error{{use of undeclared identifier 'v'}} \
-                               c99-error{{call to undeclared function 'va_arg'}} c99-error{{expected expression}} c99-error{{undeclared identifier}}
+                               c99-error{{call to undeclared function 'va_arg'}} c99-error{{expected expression}} c99-error{{undeclared identifier}} \
+                               c89-modules-error{{implicit declaration of function}} c89-modules-error{{expected expression}} c89-modules-error{{undeclared identifier}} \
+                               c99-modules-error{{undeclared function}} c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}}
     va_end(v); // c89-error{{implicitly declaring library function 'va_end'}} c89-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_end'}} c89-error{{undeclared identifier 'v'}} \
-                  c99-error{{call to undeclared library function 'va_end'}} c99-note{{provide a declaration}} c99-error{{undeclared identifier}}
+                  c99-error{{call to undeclared library function 'va_end'}} c99-note{{provide a declaration}} c99-error{{undeclared identifier}} \
+                  c89-modules-error{{implicitly declaring library function}} c89-modules-note{{provide a declaration}} c89-modules-error{{undeclared identifier}} \
+                  c99-modules-error{{undeclared library function}} c99-modules-note{{provide a declaration}} c99-modules-error{{undeclared identifier}}
     __va_copy(g, v); // c89-error{{implicit declaration of function '__va_copy'}} c89-error{{use of undeclared identifier 'g'}} c89-error{{use of undeclared identifier 'v'}} \
-                        c99-error{{call to undeclared function '__va_copy'}} c99-error{{undeclared identifier}} c99-error{{undeclared identifier}}
+                        c99-error{{call to undeclared function '__va_copy'}} c99-error{{undeclared identifier}} c99-error{{undeclared identifier}} \
+                        c89-modules-error{{implicit declaration of function}} c89-modules-error{{undeclared identifier}} c89-modules-error{{undeclared identifier}} \
+                        c99-modules-error{{undeclared function}} c99-modules-error{{undeclared identifier}} c99-modules-error{{undeclared identifier}}
     va_copy(g, v); // c89-error{{implicitly declaring library function 'va_copy'}} c89-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_copy'}} c89-error{{use of undeclared identifier 'g'}} c89-error{{use of undeclared identifier 'v'}} \
-                      c99-error{{call to undeclared library function 'va_copy'}} c99-note{{provide a declaration}} c99-error{{undeclared identifier}} c99-error{{undeclared identifier}}
+                      c99-error{{call to undeclared library function 'va_copy'}} c99-note{{provide a declaration}} c99-error{{undeclared identifier}} c99-error{{undeclared identifier}} \
+                      c89-modules-error{{implicitly declaring library function}} c89-modules-note{{provide a declaration}} c89-modules-error{{undeclared identifier}} c89-modules-error{{undeclared identifier}} \
+                      c99-modules-error{{undeclared library function}} c99-modules-note{{provide a declaration}} c99-modules-error{{undeclared identifier}} c99-modules-error{{undeclared identifier}}
 }
 
 //--- stdarg1.c
 // c99-no-diagnostics
+// c99-modules-no-diagnostics
 
 #include <stdarg.h>
 static void f(int p, ...) {
@@ -33,5 +51,6 @@ static void f(int p, ...) {
     int i = va_arg(v, int);
     va_end(v);
     __va_copy(g, v);
-    va_copy(g, v); // c89-error{{implicitly declaring library function}} c89-note{{provide a declaration}}
+    va_copy(g, v); // c89-error{{implicitly declaring library function}} c89-note{{provide a declaration}} \
+                      c89-modules-error{{implicitly declaring library function}} c89-modules-note{{provide a declaration}}
 }
diff --git a/clang/test/Headers/stdargneeds.c b/clang/test/Headers/stdargneeds.c
index 7505ff55731f14..8ef76e90deaef0 100644
--- a/clang/test/Headers/stdargneeds.c
+++ b/clang/test/Headers/stdargneeds.c
@@ -1,23 +1,35 @@
+// RUN: rm -fR %t
 // RUN: split-file %s %t
 // RUN: %clang_cc1 -fsyntax-only -verify -Werror=implicit-function-declaration -std=c89 %t/stdargneeds0.c
+// RUN: %clang_cc1 -fsyntax-only -verify=expected-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdargneeds0.c
 // RUN: %clang_cc1 -fsyntax-only -verify -Werror=implicit-function-declaration -std=c89 %t/stdargneeds1.c
+// RUN: %clang_cc1 -fsyntax-only -verify=expected-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdargneeds1.c
 // RUN: %clang_cc1 -fsyntax-only -verify -Werror=implicit-function-declaration -std=c89 %t/stdargneeds2.c
+// RUN: %clang_cc1 -fsyntax-only -verify=expected-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdargneeds2.c
 // RUN: %clang_cc1 -fsyntax-only -verify -Werror=implicit-function-declaration -std=c89 %t/stdargneeds3.c
+// RUN: %clang_cc1 -fsyntax-only -verify=expected-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdargneeds3.c
 // RUN: %clang_cc1 -fsyntax-only -verify -Werror=implicit-function-declaration -std=c89 %t/stdargneeds4.c
+// RUN: %clang_cc1 -fsyntax-only -verify=expected-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdargneeds4.c
 // RUN: %clang_cc1 -fsyntax-only -verify -Werror=implicit-function-declaration -std=c89 %t/stdargneeds5.c
+// RUN: %clang_cc1 -fsyntax-only -verify=expected-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Werror=implicit-function-declaration -std=c89 %t/stdargneeds5.c
 
 // Split the file so that the "implicitly declaring library function" errors get repeated.
 // Use C89 to verify that __need_ can be used to get types that wouldn't normally be available.
 
 //--- stdargneeds0.c
 static void f(int p, ...) {
-    __gnuc_va_list g; // expected-error{{undeclared identifier '__gnuc_va_list'}}
-    va_list v; // expected-error{{undeclared identifier 'va_list'}}
-    va_start(v, p); // expected-error{{implicitly declaring library function 'va_start'}} expected-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_start'}} expected-error{{undeclared identifier 'v'}}
-    int i = va_arg(v, int); // expected-error{{implicit declaration of function 'va_arg'}} expected-error{{expected expression}} expected-error{{use of undeclared identifier 'v'}}
-    va_end(v); // expected-error{{implicitly declaring library function 'va_end'}} expected-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_end'}} expected-error{{undeclared identifier 'v'}}
-    __va_copy(g, v); // expected-error{{implicit declaration of function '__va_copy'}} expected-error{{use of undeclared identifier 'g'}} expected-error{{use of undeclared identifier 'v'}}
-    va_copy(g, v); // expected-error{{implicitly declaring library function 'va_copy'}} expected-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_copy'}} expected-error{{use of undeclared identifier 'g'}} expected-error{{use of undeclared identifier 'v'}}
+    __gnuc_va_list g; // expected-error{{undeclared identifier '__gnuc_va_list'}} expected-modules-error{{undeclared identifier}}
+    va_list v; // expected-error{{undeclared identifier 'va_list'}} expected-modules-error{{undeclared identifier}}
+    va_start(v, p); // expected-error{{implicitly declaring library function 'va_start'}} expected-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_start'}} expected-error{{undeclared identifier 'v'}} \
+                       expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}} expected-modules-error{{undeclared identifier}}
+    int i = va_arg(v, int); // expected-error{{implicit declaration of function 'va_arg'}} expected-error{{expected expression}} expected-error{{use of undeclared identifier 'v'}} \
+                               expected-modules-error{{implicit declaration of function}} expected-modules-error{{expected expression}} expected-modules-error{{undeclared identifier}}
+    va_end(v); // expected-error{{implicitly declaring library function 'va_end'}} expected-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_end'}} expected-error{{undeclared identifier 'v'}} \
+                  expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}} expected-modules-error{{undeclared identifier}}
+    __va_copy(g, v); // expected-error{{implicit declaration of function '__va_copy'}} expected-error{{use of undeclared identifier 'g'}} expected-error{{use of undeclared identifier 'v'}} \
+                        expected-modules-error{{implicit declaration of function}} expected-modules-error{{undeclared identifier}} expected-modules-error{{undeclared identifier}}
+    va_copy(g, v); // expected-error{{implicitly declaring library function 'va_copy'}} expected-note{{include the header <stdarg.h> or explicitly provide a declaration for 'va_copy'}} expected-error{{use of undeclared identifier 'g'}} expected-error{{use of undeclared identifier 'v'}} \
+                      expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}} expected-modules-error{{undeclared identifier}} expected-modules-error{{undeclared identifier}}
 }
 
 //--- stdargneeds1.c
@@ -25,25 +37,37 @@ static void f(int p, ...) {
 #include <stdarg.h>
 static void f(int p, ...) {
     __gnuc_va_list g;
-    va_list v; // expected-error{{undeclared identifier}}
-    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}}
-    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}} expected-error{{undeclared identifier}}
-    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}}
-    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-error{{undeclared identifier}}
-    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}}
+    va_list v; // expected-error{{undeclared identifier}} \
+                  expected-modules-error{{'va_list' must be declared before it is used}} expected-modules-note@__stdarg_va_list.h:*{{declaration here is not visible}}
+    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}} \
+                       expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}} expected-error{{undeclared identifier}} \
+                               expected-modules-error{{implicit declaration of function}} expected-modules-error{{expected expression}}
+    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}} \
+                  expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-error{{undeclared identifier}} \
+                        expected-modules-error{{implicit declaration of function}}
+    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}} \
+                      expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
 }
 
 //--- stdargneeds2.c
 #define __need_va_list
 #include <stdarg.h>
 static void f(int p, ...) {
-    __gnuc_va_list g; // expected-error{{undeclared identifier}}
+    __gnuc_va_list g; // expected-error{{undeclared identifier}} \
+                         expected-modules-error{{'__gnuc_va_list' must be declared before it is used}} expected-modules-note@__stdarg___gnuc_va_list.h:*{{declaration here is not visible}}
     va_list v;
-    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
-    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}}
-    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
-    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-error{{undeclared identifier}}
-    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}}
+    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                       expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}} \
+                               expected-modules-error{{implicit declaration of function}} expected-modules-error{{expected expression}}
+    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                  expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-error{{undeclared identifier}} \
+                        expected-modules-error{{implicit declaration of function}}
+    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}} \
+                      expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
 }
 
 //--- stdargneeds3.c
@@ -51,13 +75,16 @@ static void f(int p, ...) {
 #define __need_va_arg
 #include <stdarg.h>
 static void f(int p, ...) {
-    __gnuc_va_list g; // expected-error{{undeclared identifier}}
+    __gnuc_va_list g; // expected-error{{undeclared identifier}} \
+                         expected-modules-error{{'__gnuc_va_list' must be declared before it is used}} expected-modules-note@__stdarg___gnuc_va_list.h:*{{declaration here is not visible}}
     va_list v;
     va_start(v, p);
     int i = va_arg(v, int);
     va_end(v);
-    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-error{{undeclared identifier}}
-    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}}
+    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-error{{undeclared identifier}} \
+                        expected-modules-error{{implicit declaration of function}}
+    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} expected-error{{undeclared identifier}} \
+                      expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
 }
 
 //--- stdargneeds4.c
@@ -68,11 +95,15 @@ static void f(int p, ...) {
 static void f(int p, ...) {
     __gnuc_va_list g;
     va_list v;
-    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
-    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}}
-    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
+    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                       expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}} \
+                               expected-modules-error{{implicit declaration of function}} expected-modules-error{{expected expression}}
+    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                  expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
     __va_copy(g, v);
-    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
+    va_copy(g, v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                      expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
 }
 
 //--- stdargneeds5.c
@@ -83,9 +114,12 @@ static void f(int p, ...) {
 static void f(int p, ...) {
     __gnuc_va_list g;
     va_list v;
-    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
-    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}}
-    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}}
-    __va_copy(g, v); // expected-error{{implicit declaration of function}}
+    va_start(v, p); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                       expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    int i = va_arg(v, int); // expected-error{{implicit declaration of function}} expected-error{{expected expression}} \
+                               expected-modules-error{{implicit declaration of function}} expected-modules-error{{expected expression}}
+    va_end(v); // expected-error{{implicitly declaring library function}} expected-note{{provide a declaration}} \
+                  expected-modules-error{{implicitly declaring library function}} expected-modules-note{{provide a declaration}}
+    __va_copy(g, v); // expected-error{{implicit declaration of function}} expected-modules-error{{implicit declaration of function}}
     va_copy(g, v);
 }
diff --git a/clang/test/Headers/stddef.c b/clang/test/Headers/stddef.c
index d2bceb84a0ba0b..bf564fb15221ae 100644
--- a/clang/test/Headers/stddef.c
+++ b/clang/test/Headers/stddef.c
@@ -1,35 +1,57 @@
+// RUN: rm -fR %t
 // RUN: %clang_cc1 -fsyntax-only -verify=c99 -std=c99 %s
 // RUN: %clang_cc1 -fsyntax-only -verify=c11 -std=c11 %s
 // RUN: %clang_cc1 -fsyntax-only -verify=c23 -std=c23 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=c99-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c99 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=c11-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=c23-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c23 %s
 
 struct astruct { char member; };
 
-ptrdiff_t p0; // c99-error{{unknown type name 'ptrdiff_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
-size_t s0; // c99-error{{unknown type name 'size_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
-rsize_t r0; // c99-error{{unknown type name 'rsize_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
-wchar_t wc0; // c99-error{{unknown type name 'wchar_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
-void *v0 = NULL; // c99-error{{use of undeclared identifier 'NULL'}} c11-error{{undeclared identifier}} c23-error{{undeclared identifier}}
-nullptr_t n0; // c99-error{{unknown type name 'nullptr_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
-static void f0(void) { unreachable(); } // c99-error{{call to undeclared function 'unreachable'}} c11-error{{undeclared function}} c23-error{{undeclared identifier}}
-max_align_t m0; // c99-error{{unknown type name 'max_align_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
+ptrdiff_t p0; // c99-error{{unknown type name 'ptrdiff_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+size_t s0; // c99-error{{unknown type name 'size_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+              c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+rsize_t r0; // c99-error{{unknown type name 'rsize_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+wchar_t wc0; // c99-error{{unknown type name 'wchar_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+                c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+void *v0 = NULL; // c99-error{{use of undeclared identifier 'NULL'}} c11-error{{undeclared identifier}} c23-error{{undeclared identifier}} \
+                    c99-modules-error{{undeclared identifier}} c11-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}}
+nullptr_t n0; // c99-error{{unknown type name 'nullptr_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+static void f0(void) { unreachable(); } // c99-error{{call to undeclared function 'unreachable'}} c11-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c11-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
+max_align_t m0; // c99-error{{unknown type name 'max_align_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+                   c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 size_t o0 = offsetof(struct astruct, member); // c99-error{{unknown type name 'size_t'}} c99-error{{call to undeclared function 'offsetof'}} c99-error{{expected expression}} c99-error{{use of undeclared identifier 'member'}} \
                                                  c11-error{{unknown type}} c11-error{{undeclared function}} c11-error{{expected expression}} c11-error{{undeclared identifier}} \
-                                                 c23-error{{unknown type}} c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi0; // c99-error{{unknown type name 'wint_t'}} c11-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{unknown type}} c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{unknown type}} c99-modules-error{{undeclared function}} c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c11-modules-error{{unknown type}} c11-modules-error{{undeclared function}} c11-modules-error{{expected expression}} c11-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{unknown type}} c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi0; // c99-error{{unknown type name 'wint_t'}} c11-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type name 'wint_t'}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #include <stddef.h>
 
 ptrdiff_t p1;
 size_t s1;
-rsize_t r1; // c99-error{{unknown type}} c11-error{{unknown type}} c23-error{{unknown type}}
-            // c99-note@__stddef_size_t.h:*{{'size_t' declared here}} c11-note@__stddef_size_t.h:*{{'size_t' declared here}} c23-note@__stddef_size_t.h:*{{'size_t' declared here}}
+rsize_t r1; // c99-error{{unknown type}} c11-error{{unknown type}} c23-error{{unknown type}} \
+               c99-note@__stddef_size_t.h:*{{'size_t' declared here}} c11-note@__stddef_size_t.h:*{{'size_t' declared here}} c23-note@__stddef_size_t.h:*{{'size_t' declared here}} \
+               c99-modules-error{{'rsize_t' must be declared before it is used}} c11-modules-error{{must be declared}} c23-modules-error{{must be declared}} \
+               c99-modules-note@__stddef_rsize_t.h:*{{declaration here is not visible}} c11-modules-note@__stddef_rsize_t.h:*{{declaration here is not visible}} c23-modules-note@__stddef_rsize_t.h:*{{declaration here is not visible}}
 wchar_t wc1;
 void *v1 = NULL;
-nullptr_t n1; // c99-error{{unknown type}} c11-error{{unknown type}}
-static void f1(void) { unreachable(); } // c99-error{{undeclared function}} c11-error{{undeclared function}}
-max_align_t m1; // c99-error{{unknown type}}
+nullptr_t n1; // c99-error{{unknown type}} c11-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c11-modules-error{{unknown type}}
+static void f1(void) { unreachable(); } // c99-error{{undeclared function}} c11-error{{undeclared function}} \
+                                           c99-modules-error{{undeclared function}} c11-modules-error{{undeclared function}}
+max_align_t m1; // c99-error{{unknown type}} c99-modules-error{{'max_align_t' must be declared before it is used}} \
+                   c99-modules-note@__stddef_max_align_t.h:*{{declaration here is not visible}}
 size_t o1 = offsetof(struct astruct, member);
-wint_t wi1; // c99-error{{unknown type}} c11-error{{unknown type}} c23-error{{unknown type}}
+wint_t wi1; // c99-error{{unknown type}} c11-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 // rsize_t needs to be opted into via __STDC_WANT_LIB_EXT1__ >= 1.
 #define __STDC_WANT_LIB_EXT1__ 1
@@ -39,8 +61,14 @@ size_t s2;
 rsize_t r2;
 wchar_t wc2;
 void *v2 = NULL;
-nullptr_t n2; // c99-error{{unknown type}} c11-error{{unknown type}}
-static void f2(void) { unreachable(); } // c99-error{{undeclared function}} c11-error{{undeclared function}}
+nullptr_t n2; // c99-error{{unknown type}} c11-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c11-modules-error{{unknown type}}
+static void f2(void) { unreachable(); } // c99-error{{undeclared function}} c11-error{{undeclared function}} \
+                                           c99-modules-error{{undeclared function}} c11-modules-error{{undeclared function}}
 max_align_t m2; // c99-error{{unknown type}}
 size_t o2 = offsetof(struct astruct, member);
-wint_t wi2; // c99-error{{unknown type}} c11-error{{unknown type}} c23-error{{unknown type}}
+wint_t wi2; // c99-error{{unknown type}} c11-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c11-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+
+// m2 and wi2 don't generate errors in modules, the "must be declared before used"
+// errors are only emitted the first time the known-but-not-visible type is seen.
diff --git a/clang/test/Headers/stddefneeds.c b/clang/test/Headers/stddefneeds.c
index dd7ecd0e6028da..fae7d7041d6af5 100644
--- a/clang/test/Headers/stddefneeds.c
+++ b/clang/test/Headers/stddefneeds.c
@@ -1,36 +1,72 @@
+// RUN: rm -fR %t
 // RUN: %clang_cc1 -fsyntax-only -verify=c99 -std=c99 %s
 // RUN: %clang_cc1 -fsyntax-only -verify=c23 -std=c23 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=c99-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c99 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=c23-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c23 %s
 
 // Use C99 to verify that __need_ can be used to get types that wouldn't normally be available.
 
 struct astruct { char member; };
 
-ptrdiff_t p0; // c99-error{{unknown type name 'ptrdiff_t'}} c23-error{{unknown type}}
-size_t s0; // c99-error{{unknown type name 'size_t'}} c23-error{{unknown type}}
-rsize_t r0; // c99-error{{unknown type name 'rsize_t'}} c23-error{{unknown type}}
-wchar_t wc0; // c99-error{{unknown type name 'wchar_t'}} c23-error{{unknown type}}
-void *v0 = NULL; // c99-error{{use of undeclared identifier 'NULL'}} c23-error{{undeclared identifier}}
-nullptr_t n0; // c99-error{{unknown type name 'nullptr_t'}} c23-error{{unknown type}}
-static void f0(void) { unreachable(); } // c99-error{{call to undeclared function 'unreachable'}} c23-error{{undeclared identifier 'unreachable'}}
-max_align_t m0; // c99-error{{unknown type name 'max_align_t'}} c23-error{{unknown type}}
+ptrdiff_t p0; // c99-error{{unknown type name 'ptrdiff_t'}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+size_t s0; // c99-error{{unknown type name 'size_t'}} c23-error{{unknown type}} \
+              c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+rsize_t r0; // c99-error{{unknown type name 'rsize_t'}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+wchar_t wc0; // c99-error{{unknown type name 'wchar_t'}} c23-error{{unknown type}} \
+                c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+void *v0 = NULL; // c99-error{{use of undeclared identifier 'NULL'}} c23-error{{undeclared identifier}} \
+                    c99-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}}
+nullptr_t n0; // c99-error{{unknown type name 'nullptr_t'}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+static void f0(void) { unreachable(); } // c99-error{{call to undeclared function 'unreachable'}} c23-error{{undeclared identifier 'unreachable'}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
+max_align_t m0; // c99-error{{unknown type name 'max_align_t'}} c23-error{{unknown type}} \
+                   c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 size_t o0 = offsetof(struct astruct, member); // c99-error{{unknown type name 'size_t'}} c99-error{{call to undeclared function 'offsetof'}} c99-error{{expected expression}} c99-error{{use of undeclared identifier 'member'}} \
-                                                 c23-error{{unknown type name 'size_t'}} c23-error{{undeclared identifier 'offsetof'}} c23-error{{expected expression}} c23-error{{use of undeclared identifier 'member'}}
-wint_t wi0; // c99-error{{unknown type name 'wint_t'}} c23-error{{unknown type}}
+                                                 c23-error{{unknown type name 'size_t'}} c23-error{{undeclared identifier 'offsetof'}} c23-error{{expected expression}} c23-error{{use of undeclared identifier 'member'}} \
+                                                 c99-modules-error{{unknown type}} c99-modules-error{{undeclared function}} c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{unknown type}} c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi0; // c99-error{{unknown type name 'wint_t'}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_ptrdiff_t
 #include <stddef.h>
 
 ptrdiff_t p1;
-size_t s1; // c99-error{{unknown type}} c23-error{{unknown type}}
-rsize_t r1; // c99-error{{unknown type}} c23-error{{unknown type}}
-wchar_t wc1; // c99-error{{unknown type}} c23-error{{unknown type}}
-void *v1 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}}
-nullptr_t n1; // c99-error{{unknown type}} c23-error{{unknown type}}
-static void f1(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}}
-max_align_t m1; // c99-error{{unknown type}} c23-error{{unknown type}}
+size_t s1; // c99-error{{unknown type}} c23-error{{unknown type}} \
+              c99-modules-error{{'size_t' must be declared before it is used}} c23-modules-error{{must be declared}} \
+              c99-modules-note@__stddef_size_t.h:*{{declaration here is not visible}} c23-modules-note@__stddef_size_t.h:*{{declaration here is not visible}}
+rsize_t r1; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{'rsize_t' must be declared before it is used}} c23-modules-error{{must be declared}} \
+               c99-modules-note@__stddef_rsize_t.h:*{{declaration here is not visible}} c23-modules-note@__stddef_rsize_t.h:*{{declaration here is not visible}}
+wchar_t wc1; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                c99-modules-error{{'wchar_t' must be declared before it is used}} c23-modules-error{{must be declared}} \
+                c99-modules-note@__stddef_wchar_t.h:*{{declaration here is not visible}} c23-modules-note@__stddef_wchar_t.h:*{{declaration here is not visible}}
+void *v1 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}} \
+                    c99-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}}
+nullptr_t n1; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}} c23-modules-error{{'nullptr_t' must be declared before it is used}} \
+                 c23-modules-note@__stddef_nullptr_t.h:*{{declaration here is not visible}}
+static void f1(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
+max_align_t m1; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                   c99-modules-error{{'max_align_t' must be declared before it is used}} c23-modules-error{{must be declared}} \
+                   c99-modules-note@__stddef_max_align_t.h:*{{declaration here is not visible}} c23-modules-note@__stddef_max_align_t.h:*{{declaration here is not visible}}
 size_t o1 = offsetof(struct astruct, member); // c99-error{{unknown type}} c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{unknown type}} c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi1; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{unknown type}} c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi1; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
+
+// The "must be declared before used" errors are only emitted the first time a
+// known-but-not-visible type is seen. At this point the _Builtin_stddef module
+// has been built and all of the types tried, so most of the errors won't be
+// repeated below in modules. The types still aren't available, just the errors
+// aren't repeated. e.g. rsize_t still isn't available, if r1 above got deleted,
+// its error would move to r2 below.
 
 #define __need_size_t
 #include <stddef.h>
@@ -40,13 +76,19 @@ size_t s2;
 rsize_t r2; // c99-error{{unknown type}} c23-error{{unknown type}}
             // c99-note@__stddef_size_t.h:*{{'size_t' declared here}} c23-note@__stddef_size_t.h:*{{'size_t' declared here}}
 wchar_t wc2; // c99-error{{unknown type}} c23-error{{unknown type}}
-void *v2 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}}
-nullptr_t n2; // c99-error{{unknown type}} c23-error{{unknown type}}
-static void f2(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}}
+void *v2 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}} \
+                    c99-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}}
+nullptr_t n2; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}}
+static void f2(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
 max_align_t m2; // c99-error{{unknown type}} c23-error{{unknown type}}
 size_t o2 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi2; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi2; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_rsize_t
 #include <stddef.h>
@@ -55,13 +97,19 @@ ptrdiff_t p3;
 size_t s3;
 rsize_t r3;
 wchar_t wc3; // c99-error{{unknown type}} c23-error{{unknown type}}
-void *v3 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}}
-nullptr_t n3; // c99-error{{unknown type}} c23-error{{unknown type}}
-static void f3(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}}
+void *v3 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}} \
+                    c99-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}}
+nullptr_t n3; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}}
+static void f3(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
 max_align_t m3; // c99-error{{unknown type}} c23-error{{unknown type}}
 size_t o3 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi3; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi3; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_wchar_t
 #include <stddef.h>
@@ -70,13 +118,19 @@ ptrdiff_t p4;
 size_t s4;
 rsize_t r4;
 wchar_t wc4;
-void *v4 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}}
-nullptr_t n4; // c99-error{{unknown type}} c23-error{{unknown type}}
-static void f4(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}}
+void *v4 = NULL; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}} \
+                    c99-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}}
+nullptr_t n4; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}}
+static void f4(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
 max_align_t m4; // c99-error{{unknown type}} c23-error{{unknown type}}
 size_t o4 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi4; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi4; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_NULL
 #include <stddef.h>
@@ -86,29 +140,38 @@ size_t s5;
 rsize_t r5;
 wchar_t wc5;
 void *v5 = NULL;
-nullptr_t n5; // c99-error{{unknown type}} c23-error{{unknown type}}
-static void f5(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}}
+nullptr_t n5; // c99-error{{unknown type}} c23-error{{unknown type}} \
+                 c99-modules-error{{unknown type}}
+static void f5(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
 max_align_t m5; // c99-error{{unknown type}} c23-error{{unknown type}}
 size_t o5 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi5; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi5; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
-// __need_nullptr_t generates an error in <C23 because its definition
+// nullptr_t doesn't get declared before C23 because its definition
 // depends on nullptr.
 #define __need_nullptr_t
-#include <stddef.h> // c99-error@__stddef_nullptr_t.h:*{{expected function body}}
+#include <stddef.h>
 
 ptrdiff_t p6;
 size_t s6;
 rsize_t r6;
 wchar_t wc6;
 void *v6 = NULL;
-nullptr_t n6; // c99-error{{unknown type}}
-static void f6(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}}
+nullptr_t n6; // c99-error{{unknown type}} c99-modules-error{{unknown type}}
+static void f6(void) { unreachable(); } // c99-error{{undeclared function}} c23-error{{undeclared identifier}} \
+                                           c99-modules-error{{undeclared function}} c23-modules-error{{undeclared identifier}}
 max_align_t m6; // c99-error{{unknown type}} c23-error{{unknown type}}
 size_t o6 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi6; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi6; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_unreachable
 #include <stddef.h>
@@ -118,12 +181,15 @@ size_t s7;
 rsize_t r7;
 wchar_t wc7;
 void *v7 = NULL;
-nullptr_t n7 ; // c99-error{{unknown type}}
+nullptr_t n7 ; // c99-error{{unknown type}} c99-modules-error{{unknown type}}
 static void f7(void) { unreachable(); }
 max_align_t m7; // c99-error{{unknown type}} c23-error{{unknown type}}
 size_t o7 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi7; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi7; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_max_align_t
 #include <stddef.h>
@@ -133,12 +199,15 @@ size_t s8;
 rsize_t r8;
 wchar_t wc8;
 void *v8 = NULL;
-nullptr_t n8; // c99-error{{unknown type}}
+nullptr_t n8; // c99-error{{unknown type}} c99-modules-error{{unknown type}}
 static void f8(void) { unreachable(); }
 max_align_t m8;
 size_t o8 = offsetof(struct astruct, member); // c99-error{{expected expression}} c99-error{{undeclared identifier}} \
-                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}}
-wint_t wi8; // c99-error{{unknown type}} c23-error{{unknown type}}
+                                                 c23-error{{undeclared identifier}} c23-error{{expected expression}} c23-error{{undeclared identifier}} \
+                                                 c99-modules-error{{expected expression}} c99-modules-error{{undeclared identifier}} \
+                                                 c23-modules-error{{undeclared identifier}} c23-modules-error{{expected expression}} c23-modules-error{{undeclared identifier}}
+wint_t wi8; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_offsetof
 #include <stddef.h>
@@ -146,13 +215,14 @@ wint_t wi8; // c99-error{{unknown type}} c23-error{{unknown type}}
 ptrdiff_t p9;
 size_t s9;
 rsize_t r9;
-nullptr_t n9; // c99-error{{unknown type}}
+nullptr_t n9; // c99-error{{unknown type}} c99-modules-error{{unknown type}}
 static void f9(void) { unreachable(); }
 wchar_t wc9;
 void *v9 = NULL;
 max_align_t m9;
 size_t o9 = offsetof(struct astruct, member);
-wint_t wi9; // c99-error{{unknown type}} c23-error{{unknown type}}
+wint_t wi9; // c99-error{{unknown type}} c23-error{{unknown type}} \
+               c99-modules-error{{unknown type}} c23-modules-error{{unknown type}}
 
 #define __need_wint_t
 #include <stddef.h>
@@ -162,7 +232,7 @@ size_t s10;
 rsize_t r10;
 wchar_t wc10;
 void *v10 = NULL;
-nullptr_t n10; // c99-error{{unknown type}}
+nullptr_t n10; // c99-error{{unknown type}} c99-modules-error{{unknown type}}
 static void f10(void) { unreachable(); }
 max_align_t m10;
 size_t o10 = offsetof(struct astruct, member);
diff --git a/clang/test/Interpreter/const.cpp b/clang/test/Interpreter/const.cpp
new file mode 100644
index 00000000000000..4b6ce65e3643e6
--- /dev/null
+++ b/clang/test/Interpreter/const.cpp
@@ -0,0 +1,32 @@
+// UNSUPPORTED: system-aix
+// see https://github.com/llvm/llvm-project/issues/68092
+// XFAIL: system-windows
+
+// RUN: cat %s | clang-repl | FileCheck %s
+// RUN: cat %s | clang-repl -Xcc -O2 | FileCheck %s
+
+extern "C" int printf(const char*, ...);
+
+struct A { int val; A(int v); ~A(); void f() const; };
+A::A(int v) : val(v) { printf("A(%d), this = %p\n", val, this); }
+A::~A() { printf("~A, this = %p, val = %d\n", this, val); }
+void A::f() const { printf("f: this = %p, val = %d\n", this, val); }
+
+const A a(1);
+// CHECK: A(1), this = [[THIS:.+]]
+// The constructor must only be called once!
+// CHECK-NOT: A(1)
+
+a.f();
+// CHECK-NEXT: f: this = [[THIS]], val = 1
+a.f();
+// CHECK-NEXT: f: this = [[THIS]], val = 1
+
+%quit
+// There must still be no other constructor!
+// CHECK-NOT: A(1)
+
+// At the end, we expect exactly one destructor call
+// CHECK: ~A
+// CHECK-SAME: this = [[THIS]], val = 1
+// CHECK-NOT: ~A
diff --git a/clang/test/Lexer/cxx1z-trigraphs.cpp b/clang/test/Lexer/cxx1z-trigraphs.cpp
index 2e47c05e9bd33b..6b39ff6716605b 100644
--- a/clang/test/Lexer/cxx1z-trigraphs.cpp
+++ b/clang/test/Lexer/cxx1z-trigraphs.cpp
@@ -21,7 +21,7 @@ error here;
 
 #if !ENABLED_TRIGRAPHS
 // expected-error@11 {{}} expected-warning@11 {{trigraph ignored}}
-// expected-error@13 {{failed}} expected-warning@13 {{trigraph ignored}} expected-note@13 {{evaluates to ''?' == '#''}}
+// expected-error@13 {{failed}} expected-warning@13 {{trigraph ignored}} expected-note@13 {{evaluates to ''?' (0x3F, 63) == '#' (0x23, 35)'}}
 // expected-error@16 {{}}
 // expected-error@20 {{}}
 #else
diff --git a/clang/test/Misc/misc-source-ranges.cpp b/clang/test/Misc/misc-source-ranges.cpp
new file mode 100644
index 00000000000000..7a9d9d057dac40
--- /dev/null
+++ b/clang/test/Misc/misc-source-ranges.cpp
@@ -0,0 +1,7 @@
+// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-print-source-range-info %s 2>&1 | FileCheck %s
+
+struct S {
+  char a : 12 - 12;
+};
+// CHECK: misc-source-ranges.cpp:[[@LINE-2]]:8:{[[@LINE-2]]:12-[[@LINE-2]]:19}
+
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index eaf6d34421bbe0..f48126775c8685 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -56,6 +56,7 @@
 // CHECK-NEXT: ConsumableAutoCast (SubjectMatchRule_record)
 // CHECK-NEXT: ConsumableSetOnRead (SubjectMatchRule_record)
 // CHECK-NEXT: Convergent (SubjectMatchRule_function)
+// CHECK-NEXT: CountedBy (SubjectMatchRule_field)
 // CHECK-NEXT: DLLExport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
 // CHECK-NEXT: DLLImport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
 // CHECK-NEXT: Destructor (SubjectMatchRule_function)
diff --git a/clang/test/Modules/Inputs/builtin-headers/builtin-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/builtin-modules.modulemap
new file mode 100644
index 00000000000000..0fbb454dbb0223
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/builtin-modules.modulemap
@@ -0,0 +1,29 @@
+module c_complex [system] {
+  header "complex.h"
+  export *
+}
+
+module c_float [system] {
+  header "float.h"
+  export *
+}
+
+module c_inttypes [system] {
+  header "inttypes.h"
+  export *
+}
+
+module c_limits [system] {
+  header "limits.h"
+  export *
+}
+
+module c_math [system] {
+  header "math.h"
+  export *
+}
+
+module c_stdint [system] {
+  header "stdint.h"
+  export *
+}
diff --git a/clang/test/Modules/Inputs/builtin-headers/c++/module.modulemap b/clang/test/Modules/Inputs/builtin-headers/c++/module.modulemap
new file mode 100644
index 00000000000000..c969f067a234f0
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/c++/module.modulemap
@@ -0,0 +1,4 @@
+module cpp_stdint [system] {
+  header "stdint.h"
+  export *
+}
diff --git a/clang/test/Modules/Inputs/builtin-headers/c++/stdint.h b/clang/test/Modules/Inputs/builtin-headers/c++/stdint.h
new file mode 100644
index 00000000000000..22cebacfdb64e4
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/c++/stdint.h
@@ -0,0 +1,6 @@
+#ifndef CPP_STDINT_H
+#define CPP_STDINT_H
+
+#include_next <stdint.h>
+
+#endif
diff --git a/clang/test/Modules/Inputs/builtin-headers/complex.h b/clang/test/Modules/Inputs/builtin-headers/complex.h
new file mode 100644
index 00000000000000..b6ef3c5a35b450
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/complex.h
@@ -0,0 +1 @@
+// Required by tgmath.h
diff --git a/clang/test/Modules/Inputs/builtin-headers/float.h b/clang/test/Modules/Inputs/builtin-headers/float.h
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/clang/test/Modules/Inputs/builtin-headers/inttypes.h b/clang/test/Modules/Inputs/builtin-headers/inttypes.h
new file mode 100644
index 00000000000000..ddb6bb83b57238
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/inttypes.h
@@ -0,0 +1,12 @@
+#ifndef INTTYPES_H
+#define INTTYPES_H
+
+// This creates an include cycle when inttypes.h and stdint.h
+// are both part of the cstd module. This include will resolve
+// to the C++ stdint.h, which will #include_next eventually to
+// the stdint.h in this directory, and thus create the cycle
+// cstd (inttypes.h) -> cpp_stdint (stdint.h) -> cstd (stdint.h).
+// This cycle is worked around by cstd using [no_undeclared_includes].
+#include <stdint.h>
+
+#endif
diff --git a/clang/test/Modules/Inputs/builtin-headers/limits.h b/clang/test/Modules/Inputs/builtin-headers/limits.h
new file mode 100644
index 00000000000000..8b137891791fe9
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/limits.h
@@ -0,0 +1 @@
+
diff --git a/clang/test/Modules/Inputs/builtin-headers/math.h b/clang/test/Modules/Inputs/builtin-headers/math.h
new file mode 100644
index 00000000000000..b6ef3c5a35b450
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/math.h
@@ -0,0 +1 @@
+// Required by tgmath.h
diff --git a/clang/test/Modules/Inputs/builtin-headers/stdint.h b/clang/test/Modules/Inputs/builtin-headers/stdint.h
new file mode 100644
index 00000000000000..8f233fd7f45d6d
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/stdint.h
@@ -0,0 +1,34 @@
+#ifndef STDINT_H
+#define STDINT_H
+
+// types needed by stdatomic.h
+
+typedef char int_least8_t;
+typedef short int_least16_t;
+typedef int int_least32_t;
+typedef long long int int_least64_t;
+typedef unsigned char uint_least8_t;
+typedef unsigned short uint_least16_t;
+typedef unsigned int uint_least32_t;
+typedef unsigned long long uint_least64_t;
+
+typedef char int_fast8_t;
+typedef short int_fast16_t;
+typedef int int_fast32_t;
+typedef long long int int_fast64_t;
+typedef unsigned char uint_fast8_t;
+typedef unsigned short uint_fast16_t;
+typedef unsigned int uint_fast32_t;
+typedef unsigned long long uint_fast64_t;
+
+typedef int intptr_t;
+typedef unsigned int uintptr_t;
+typedef int intmax_t;
+typedef unsigned int uintmax_t;
+
+// additional types for unwind.h
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+#endif
diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
new file mode 100644
index 00000000000000..0161ff80fe6189
--- /dev/null
+++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
@@ -0,0 +1,71 @@
+module cstd [system] [no_undeclared_includes] {
+  module complex {
+    header "complex.h"
+    export *
+  }
+
+  module float {
+    header "float.h"
+    export *
+  }
+
+  module inttypes {
+    header "inttypes.h"
+    export *
+  }
+
+  module iso646 {
+    header "iso646.h"
+    export *
+  }
+
+  module limits {
+    header "limits.h"
+    export *
+  }
+
+  module math {
+    header "math.h"
+    export *
+  }
+
+  module stdalign {
+    header "stdalign.h"
+    export *
+  }
+
+  module stdarg {
+    header "stdarg.h"
+    export *
+  }
+
+  module stdatomic {
+    header "stdatomic.h"
+    export *
+  }
+
+  module stdbool {
+    header "stdbool.h"
+    export *
+  }
+
+  module stddef {
+    header "stddef.h"
+    export *
+  }
+
+  module stdint {
+    header "stdint.h"
+    export *
+  }
+
+  module tgmath {
+    header "tgmath.h"
+    export *
+  }
+
+  module unwind {
+    header "unwind.h"
+    export *
+  }
+}
diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm
new file mode 100644
index 00000000000000..4c9ddec4e99c28
--- /dev/null
+++ b/clang/test/Modules/builtin-headers.mm
@@ -0,0 +1,41 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -cxx-isystem %S/Inputs/builtin-headers/c++ -internal-isystem %S/Inputs/builtin-headers -fsyntax-only -fmodules -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/builtin-headers/c++/module.modulemap -fmodule-map-file=%resource_dir/module.modulemap -fmodule-map-file=%S/Inputs/builtin-headers/system-modules.modulemap -fbuiltin-headers-in-system-modules -DSYSTEM_MODULES %s -verify
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -cxx-isystem %S/Inputs/builtin-headers/c++ -internal-isystem %S/Inputs/builtin-headers -fsyntax-only -fmodules -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/builtin-headers/c++/module.modulemap -fmodule-map-file=%resource_dir/module.modulemap -fmodule-map-file=%S/Inputs/builtin-headers/builtin-modules.modulemap %s -verify
+
+// expected-no-diagnostics
+
+@import cpp_stdint;
+
+// The builtin modules are always available, though they're mostly
+// empty if -fbuiltin-headers-in-system-modules is used.
+@import _Builtin_float;
+@import _Builtin_inttypes;
+@import _Builtin_iso646;
+@import _Builtin_limits;
+@import _Builtin_stdalign;
+@import _Builtin_stdarg;
+@import _Builtin_stdatomic;
+@import _Builtin_stdbool;
+@import _Builtin_stddef;
+@import _Builtin_stdint;
+@import _Builtin_stdnoreturn;
+@import _Builtin_tgmath;
+@import _Builtin_unwind;
+
+#ifdef SYSTEM_MODULES
+// system-modules.modulemap uses the "mega module" style with
+// -fbuiltin-headers-in-system-modules, and its modules cover
+// the clang builtin headers.
+@import cstd;
+#else
+// builtin-modules.modulemap uses top level modules for each
+// of its headers, which allows interleaving with the builtin
+// modules and libc++ modules.
+@import c_complex;
+@import c_float;
+@import c_inttypes;
+@import c_limits;
+@import c_math;
+@import c_stdint;
+#endif
diff --git a/clang/test/Modules/module-init-duplicated-import.cppm b/clang/test/Modules/module-init-duplicated-import.cppm
new file mode 100644
index 00000000000000..de0ce1962f1008
--- /dev/null
+++ b/clang/test/Modules/module-init-duplicated-import.cppm
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/a.cppm \
+// RUN:      -emit-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
+// RUN:      -emit-module-interface -fmodule-file=a=%t/a.pcm -o %t/m.pcm
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
+// RUN:     -S -emit-llvm -o - | FileCheck %t/m.cppm
+
+//--- a.cppm
+export module a;
+export struct A {
+  A(){};
+};
+export A __dynamic_inited_a;
+
+//--- m.cppm
+export module m;
+import a;
+export import a;
+
+
+// CHECK: define void @_ZGIW1m
+// CHECK: store i8 1, ptr @_ZGIW1m__in_chrg
+// CHECK: call{{.*}}@_ZGIW1a
diff --git a/clang/test/Modules/pr67893.cppm b/clang/test/Modules/pr67893.cppm
new file mode 100644
index 00000000000000..7d4e4c1dc5d843
--- /dev/null
+++ b/clang/test/Modules/pr67893.cppm
@@ -0,0 +1,29 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/a.cppm \
+// RUN:      -emit-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
+// RUN:      -emit-module-interface -fmodule-file=a=%t/a.pcm -o %t/m.pcm
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
+// RUN:     -S -emit-llvm -o - | FileCheck %t/m.cppm
+
+//--- a.cppm
+export module a;
+export struct A {
+  A(){};
+};
+export A __dynamic_inited_a;
+
+//--- m.cppm
+module;
+import a;
+export module m;
+import a;
+module :private;
+import a;
+
+// CHECK: define void @_ZGIW1m
+// CHECK: store i8 1, ptr @_ZGIW1m__in_chrg
+// CHECK: call{{.*}}@_ZGIW1a
diff --git a/clang/test/Modules/stddef.c b/clang/test/Modules/stddef.c
index 6704abffbf0b7d..5bc0d1e44c8563 100644
--- a/clang/test/Modules/stddef.c
+++ b/clang/test/Modules/stddef.c
@@ -1,13 +1,29 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery
 
 #include "ptrdiff_t.h"
 
 ptrdiff_t pdt;
 
-size_t st; // expected-error {{missing '#include "include_again.h"'; 'size_t' must be declared before it is used}}
-// expected-note@__stddef_size_t.h:* {{here}}
+// size_t is declared in both size_t.h and __stddef_size_t.h, both of which are
+// modular headers. Regardless of whether stddef.h joins the StdDef test module
+// or is in its _Builtin_stddef module, __stddef_size_t.h will be in
+// _Builtin_stddef.size_t. It's not defined which module will win as the expected
+// provider of size_t. For the purposes of this test it doesn't matter which header
+// gets reported, just as long as it isn't other.h or include_again.h.
+size_t st; // expected-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}}
+// expected-note@size_t.h:* 0+ {{here}}
+// expected-note@__stddef_size_t.h:* 0+ {{here}}
 
 #include "include_again.h"
+// Includes <stddef.h> which includes <__stddef_size_t.h> which imports the
+// _Builtin_stddef.size_t module.
 
 size_t st2;
+
+#include "size_t.h"
+// Redeclares size_t, but the type merger should figure it out.
+
+size_t st3;
diff --git a/clang/test/Modules/stddef.m b/clang/test/Modules/stddef.m
index 1ff6ff73796abe..855464afe61af1 100644
--- a/clang/test/Modules/stddef.m
+++ b/clang/test/Modules/stddef.m
@@ -4,4 +4,6 @@
 
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I %S/Inputs/StdDef %s -verify
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs/StdDef %s -verify
 // expected-no-diagnostics
diff --git a/clang/test/OpenMP/declare_target_codegen.cpp b/clang/test/OpenMP/declare_target_codegen.cpp
index 71c742198af6bf..225695feae9515 100644
--- a/clang/test/OpenMP/declare_target_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_codegen.cpp
@@ -31,7 +31,7 @@
 // CHECK-DAG: @dy = {{protected | }}global i32 0,
 // CHECK-DAG: @bbb = {{protected | }}global i32 0,
 // CHECK-DAG: weak constant %struct.__tgt_offload_entry { ptr @bbb,
-// CHECK-DAG: @ccc = external global i32,
+// CHECK-DAG: @ccc = external {{protected | }}global i32,
 // CHECK-DAG: @ddd = {{protected | }}global i32 0,
 // CHECK-DAG: @hhh_decl_tgt_ref_ptr = weak global ptr null
 // CHECK-DAG: @ggg_decl_tgt_ref_ptr = weak global ptr null
diff --git a/clang/test/OpenMP/declare_target_constexpr_codegen.cpp b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
index 0acd98129394b8..2b256cd6a4c7f0 100644
--- a/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
@@ -16,7 +16,7 @@ class A {
 public:
   static constexpr double pi = 3.141592653589793116;
 //.
-// CHECK: @_ZN1A2piE = linkonce_odr constant double 0x400921FB54442D18, comdat, align 8
+// CHECK: @_ZN1A2piE = linkonce_odr protected constant double 0x400921FB54442D18, comdat, align 8
 // CHECK: @_ZL9anotherPi = internal constant double 3.140000e+00, align 8
 // CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
 //.
diff --git a/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp
new file mode 100644
index 00000000000000..e0fe9eb76a6f17
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_target_teams_ompx_bare_codegen.cpp
@@ -0,0 +1,56 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+
+  #pragma omp target teams ompx_bare
+  {
+    a = 2;
+  }
+
+  return a;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<char>(n);
+
+  return a;
+}
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l13
+// CHECK-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[A_CASTED]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l13_omp_outlined(ptr null, ptr [[DOTZERO_ADDR]], i64 [[TMP1]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l13_omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    store i8 2, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/ompx_bare_messages.c b/clang/test/OpenMP/ompx_bare_messages.c
new file mode 100644
index 00000000000000..a1b3c380285287
--- /dev/null
+++ b/clang/test/OpenMP/ompx_bare_messages.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown %s
+ // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-unknown %s
+ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 %s
+
+void foo() {
+}
+
+void bar() {
+#pragma omp target ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp target'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
+  foo();
+
+#pragma omp target teams distribute ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp target teams distribute'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
+  for (int i = 0; i < 10; ++i) {}
+
+#pragma omp target teams distribute parallel for ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp target teams distribute parallel for'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
+  for (int i = 0; i < 10; ++i) {}
+
+#pragma omp target
+#pragma omp teams ompx_bare // expected-error {{unexpected OpenMP clause 'ompx_bare' in directive '#pragma omp teams'}} expected-note {{OpenMP extension clause 'ompx_bare' only allowed with '#pragma omp target teams'}}
+  foo();
+}
diff --git a/clang/test/OpenMP/target_teams_ast_print.cpp b/clang/test/OpenMP/target_teams_ast_print.cpp
index 19943a81eab8ef..5f1040be01a25f 100644
--- a/clang/test/OpenMP/target_teams_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_ast_print.cpp
@@ -111,6 +111,10 @@ int main (int argc, char **argv) {
 // CHECK-NEXT: #pragma omp target teams
   a=2;
 // CHECK-NEXT: a = 2;
+#pragma omp target teams ompx_bare
+// CHECK-NEXT: #pragma omp target teams ompx_bare
+  a=3;
+// CHECK-NEXT: a = 3;
 #pragma omp target teams default(none), private(argc,b) num_teams(f) firstprivate(argv) reduction(| : c, d) reduction(* : e) thread_limit(f+g)
 // CHECK-NEXT: #pragma omp target teams default(none) private(argc,b) num_teams(f) firstprivate(argv) reduction(|: c,d) reduction(*: e) thread_limit(f + g)
   foo();
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp
index 5c32a79bb64b04..3185d90b5ef14c 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -121,6 +121,12 @@ int foo(int n) {
     aa += 1;
   }
 
+  #pragma omp target teams ompx_bare
+  {
+    a += 1;
+    aa += 1;
+  }
+
   // We capture 3 VLA sizes in this target region
 
 
@@ -336,22 +342,28 @@ int bar(int n){
 // CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x ptr], align 8
 // CHECK1-NEXT:    [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[A_CASTED16:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS19:%.*]] = alloca [9 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS20:%.*]] = alloca [9 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS21:%.*]] = alloca [9 x ptr], align 8
+// CHECK1-NEXT:    [[AA_CASTED17:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS18:%.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS19:%.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS20:%.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS21:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT:    [[A_CASTED24:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [9 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS28:%.*]] = alloca [9 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [9 x ptr], align 8
 // CHECK1-NEXT:    [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 8
-// CHECK1-NEXT:    [[KERNEL_ARGS22:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[NN:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[NN_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
-// CHECK1-NEXT:    [[NN_CASTED33:%.*]] = alloca i64, align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS34:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS35:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS36:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[KERNEL_ARGS37:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS36:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT:    [[NN_CASTED41:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS42:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS43:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS44:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS45:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
@@ -538,206 +550,259 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP95:%.*]] = load i32, ptr [[A]], align 4
 // CHECK1-NEXT:    store i32 [[TMP95]], ptr [[A_CASTED16]], align 4
 // CHECK1-NEXT:    [[TMP96:%.*]] = load i64, ptr [[A_CASTED16]], align 8
-// CHECK1-NEXT:    [[TMP97:%.*]] = load i32, ptr [[N_ADDR]], align 4
-// CHECK1-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[TMP97]], 20
-// CHECK1-NEXT:    br i1 [[CMP17]], label [[OMP_IF_THEN18:%.*]], label [[OMP_IF_ELSE25:%.*]]
-// CHECK1:       omp_if.then18:
-// CHECK1-NEXT:    [[TMP98:%.*]] = mul nuw i64 [[TMP2]], 4
-// CHECK1-NEXT:    [[TMP99:%.*]] = mul nuw i64 5, [[TMP5]]
-// CHECK1-NEXT:    [[TMP100:%.*]] = mul nuw i64 [[TMP99]], 8
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.5, i64 72, i1 false)
-// CHECK1-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
-// CHECK1-NEXT:    store i64 [[TMP96]], ptr [[TMP101]], align 8
-// CHECK1-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
-// CHECK1-NEXT:    store i64 [[TMP96]], ptr [[TMP102]], align 8
-// CHECK1-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr null, ptr [[TMP103]], align 8
-// CHECK1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 1
-// CHECK1-NEXT:    store ptr [[B]], ptr [[TMP104]], align 8
-// CHECK1-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 1
-// CHECK1-NEXT:    store ptr [[B]], ptr [[TMP105]], align 8
-// CHECK1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 1
-// CHECK1-NEXT:    store ptr null, ptr [[TMP106]], align 8
-// CHECK1-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 2
-// CHECK1-NEXT:    store i64 [[TMP2]], ptr [[TMP107]], align 8
-// CHECK1-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 2
-// CHECK1-NEXT:    store i64 [[TMP2]], ptr [[TMP108]], align 8
-// CHECK1-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 2
-// CHECK1-NEXT:    store ptr null, ptr [[TMP109]], align 8
-// CHECK1-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 3
-// CHECK1-NEXT:    store ptr [[VLA]], ptr [[TMP110]], align 8
-// CHECK1-NEXT:    [[TMP111:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 3
-// CHECK1-NEXT:    store ptr [[VLA]], ptr [[TMP111]], align 8
-// CHECK1-NEXT:    [[TMP112:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 3
-// CHECK1-NEXT:    store i64 [[TMP98]], ptr [[TMP112]], align 8
-// CHECK1-NEXT:    [[TMP113:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 3
+// CHECK1-NEXT:    [[TMP97:%.*]] = load i16, ptr [[AA]], align 2
+// CHECK1-NEXT:    store i16 [[TMP97]], ptr [[AA_CASTED17]], align 2
+// CHECK1-NEXT:    [[TMP98:%.*]] = load i64, ptr [[AA_CASTED17]], align 8
+// CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP96]], ptr [[TMP99]], align 8
+// CHECK1-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP96]], ptr [[TMP100]], align 8
+// CHECK1-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS20]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP101]], align 8
+// CHECK1-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP98]], ptr [[TMP102]], align 8
+// CHECK1-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP98]], ptr [[TMP103]], align 8
+// CHECK1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS20]], i64 0, i64 1
+// CHECK1-NEXT:    store ptr null, ptr [[TMP104]], align 8
+// CHECK1-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 2, ptr [[TMP107]], align 4
+// CHECK1-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 2, ptr [[TMP108]], align 4
+// CHECK1-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP105]], ptr [[TMP109]], align 8
+// CHECK1-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP106]], ptr [[TMP110]], align 8
+// CHECK1-NEXT:    [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes.5, ptr [[TMP111]], align 8
+// CHECK1-NEXT:    [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP112]], align 8
+// CHECK1-NEXT:    [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 6
 // CHECK1-NEXT:    store ptr null, ptr [[TMP113]], align 8
-// CHECK1-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr [[C]], ptr [[TMP114]], align 8
-// CHECK1-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr [[C]], ptr [[TMP115]], align 8
-// CHECK1-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 4
-// CHECK1-NEXT:    store ptr null, ptr [[TMP116]], align 8
-// CHECK1-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 5
-// CHECK1-NEXT:    store i64 5, ptr [[TMP117]], align 8
-// CHECK1-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 5
-// CHECK1-NEXT:    store i64 5, ptr [[TMP118]], align 8
-// CHECK1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 5
-// CHECK1-NEXT:    store ptr null, ptr [[TMP119]], align 8
-// CHECK1-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 6
-// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP120]], align 8
-// CHECK1-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 6
-// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP121]], align 8
-// CHECK1-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 6
-// CHECK1-NEXT:    store ptr null, ptr [[TMP122]], align 8
-// CHECK1-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 7
-// CHECK1-NEXT:    store ptr [[VLA1]], ptr [[TMP123]], align 8
-// CHECK1-NEXT:    [[TMP124:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 7
-// CHECK1-NEXT:    store ptr [[VLA1]], ptr [[TMP124]], align 8
-// CHECK1-NEXT:    [[TMP125:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 7
-// CHECK1-NEXT:    store i64 [[TMP100]], ptr [[TMP125]], align 8
-// CHECK1-NEXT:    [[TMP126:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 7
-// CHECK1-NEXT:    store ptr null, ptr [[TMP126]], align 8
-// CHECK1-NEXT:    [[TMP127:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 8
-// CHECK1-NEXT:    store ptr [[D]], ptr [[TMP127]], align 8
-// CHECK1-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 8
-// CHECK1-NEXT:    store ptr [[D]], ptr [[TMP128]], align 8
-// CHECK1-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i64 0, i64 8
-// CHECK1-NEXT:    store ptr null, ptr [[TMP129]], align 8
-// CHECK1-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP133]], align 4
-// CHECK1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 1
-// CHECK1-NEXT:    store i32 9, ptr [[TMP134]], align 4
-// CHECK1-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 2
-// CHECK1-NEXT:    store ptr [[TMP130]], ptr [[TMP135]], align 8
-// CHECK1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 3
-// CHECK1-NEXT:    store ptr [[TMP131]], ptr [[TMP136]], align 8
-// CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr [[TMP132]], ptr [[TMP137]], align 8
-// CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP138]], align 8
-// CHECK1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 6
-// CHECK1-NEXT:    store ptr null, ptr [[TMP139]], align 8
-// CHECK1-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 7
+// CHECK1-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP114]], align 8
+// CHECK1-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP115]], align 8
+// CHECK1-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 0, ptr [[TMP116]], align 8
+// CHECK1-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP117]], align 4
+// CHECK1-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP118]], align 4
+// CHECK1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 0, ptr [[TMP119]], align 4
+// CHECK1-NEXT:    [[TMP120:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.region_id, ptr [[KERNEL_ARGS21]])
+// CHECK1-NEXT:    [[TMP121:%.*]] = icmp ne i32 [[TMP120]], 0
+// CHECK1-NEXT:    br i1 [[TMP121]], label [[OMP_OFFLOAD_FAILED22:%.*]], label [[OMP_OFFLOAD_CONT23:%.*]]
+// CHECK1:       omp_offload.failed22:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124(i64 [[TMP96]], i64 [[TMP98]]) #[[ATTR3]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT23]]
+// CHECK1:       omp_offload.cont23:
+// CHECK1-NEXT:    [[TMP122:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    store i32 [[TMP122]], ptr [[A_CASTED24]], align 4
+// CHECK1-NEXT:    [[TMP123:%.*]] = load i64, ptr [[A_CASTED24]], align 8
+// CHECK1-NEXT:    [[TMP124:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[TMP124]], 20
+// CHECK1-NEXT:    br i1 [[CMP25]], label [[OMP_IF_THEN26:%.*]], label [[OMP_IF_ELSE33:%.*]]
+// CHECK1:       omp_if.then26:
+// CHECK1-NEXT:    [[TMP125:%.*]] = mul nuw i64 [[TMP2]], 4
+// CHECK1-NEXT:    [[TMP126:%.*]] = mul nuw i64 5, [[TMP5]]
+// CHECK1-NEXT:    [[TMP127:%.*]] = mul nuw i64 [[TMP126]], 8
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.7, i64 72, i1 false)
+// CHECK1-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP123]], ptr [[TMP128]], align 8
+// CHECK1-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP123]], ptr [[TMP129]], align 8
+// CHECK1-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP130]], align 8
+// CHECK1-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 1
+// CHECK1-NEXT:    store ptr [[B]], ptr [[TMP131]], align 8
+// CHECK1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 1
+// CHECK1-NEXT:    store ptr [[B]], ptr [[TMP132]], align 8
+// CHECK1-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 1
+// CHECK1-NEXT:    store ptr null, ptr [[TMP133]], align 8
+// CHECK1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP2]], ptr [[TMP134]], align 8
+// CHECK1-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP2]], ptr [[TMP135]], align 8
+// CHECK1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 2
+// CHECK1-NEXT:    store ptr null, ptr [[TMP136]], align 8
+// CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[VLA]], ptr [[TMP137]], align 8
+// CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[VLA]], ptr [[TMP138]], align 8
+// CHECK1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 3
+// CHECK1-NEXT:    store i64 [[TMP125]], ptr [[TMP139]], align 8
+// CHECK1-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 3
 // CHECK1-NEXT:    store ptr null, ptr [[TMP140]], align 8
-// CHECK1-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 8
-// CHECK1-NEXT:    store i64 0, ptr [[TMP141]], align 8
-// CHECK1-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP142]], align 8
-// CHECK1-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 10
-// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP143]], align 4
-// CHECK1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 11
-// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP144]], align 4
-// CHECK1-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 12
-// CHECK1-NEXT:    store i32 0, ptr [[TMP145]], align 4
-// CHECK1-NEXT:    [[TMP146:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.region_id, ptr [[KERNEL_ARGS22]])
-// CHECK1-NEXT:    [[TMP147:%.*]] = icmp ne i32 [[TMP146]], 0
-// CHECK1-NEXT:    br i1 [[TMP147]], label [[OMP_OFFLOAD_FAILED23:%.*]], label [[OMP_OFFLOAD_CONT24:%.*]]
-// CHECK1:       omp_offload.failed23:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142(i64 [[TMP96]], ptr [[B]], i64 [[TMP2]], ptr [[VLA]], ptr [[C]], i64 5, i64 [[TMP5]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
-// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT24]]
-// CHECK1:       omp_offload.cont24:
-// CHECK1-NEXT:    br label [[OMP_IF_END26:%.*]]
-// CHECK1:       omp_if.else25:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142(i64 [[TMP96]], ptr [[B]], i64 [[TMP2]], ptr [[VLA]], ptr [[C]], i64 5, i64 [[TMP5]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
-// CHECK1-NEXT:    br label [[OMP_IF_END26]]
-// CHECK1:       omp_if.end26:
-// CHECK1-NEXT:    store i32 0, ptr [[NN]], align 4
-// CHECK1-NEXT:    [[TMP148:%.*]] = load i32, ptr [[NN]], align 4
-// CHECK1-NEXT:    store i32 [[TMP148]], ptr [[NN_CASTED]], align 4
-// CHECK1-NEXT:    [[TMP149:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK1-NEXT:    [[TMP150:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
-// CHECK1-NEXT:    store i64 [[TMP149]], ptr [[TMP150]], align 8
-// CHECK1-NEXT:    [[TMP151:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
-// CHECK1-NEXT:    store i64 [[TMP149]], ptr [[TMP151]], align 8
-// CHECK1-NEXT:    [[TMP152:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr null, ptr [[TMP152]], align 8
-// CHECK1-NEXT:    [[TMP153:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP154:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP155]], align 4
-// CHECK1-NEXT:    [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
-// CHECK1-NEXT:    store i32 1, ptr [[TMP156]], align 4
-// CHECK1-NEXT:    [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
-// CHECK1-NEXT:    store ptr [[TMP153]], ptr [[TMP157]], align 8
-// CHECK1-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3
-// CHECK1-NEXT:    store ptr [[TMP154]], ptr [[TMP158]], align 8
-// CHECK1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP159]], align 8
-// CHECK1-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP160]], align 8
-// CHECK1-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6
-// CHECK1-NEXT:    store ptr null, ptr [[TMP161]], align 8
-// CHECK1-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7
-// CHECK1-NEXT:    store ptr null, ptr [[TMP162]], align 8
-// CHECK1-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8
-// CHECK1-NEXT:    store i64 0, ptr [[TMP163]], align 8
-// CHECK1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP164]], align 8
-// CHECK1-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10
-// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP165]], align 4
-// CHECK1-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11
-// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP166]], align 4
-// CHECK1-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12
-// CHECK1-NEXT:    store i32 0, ptr [[TMP167]], align 4
-// CHECK1-NEXT:    [[TMP168:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.region_id, ptr [[KERNEL_ARGS30]])
-// CHECK1-NEXT:    [[TMP169:%.*]] = icmp ne i32 [[TMP168]], 0
-// CHECK1-NEXT:    br i1 [[TMP169]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]]
+// CHECK1-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr [[C]], ptr [[TMP141]], align 8
+// CHECK1-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr [[C]], ptr [[TMP142]], align 8
+// CHECK1-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 4
+// CHECK1-NEXT:    store ptr null, ptr [[TMP143]], align 8
+// CHECK1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 5
+// CHECK1-NEXT:    store i64 5, ptr [[TMP144]], align 8
+// CHECK1-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 5
+// CHECK1-NEXT:    store i64 5, ptr [[TMP145]], align 8
+// CHECK1-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 5
+// CHECK1-NEXT:    store ptr null, ptr [[TMP146]], align 8
+// CHECK1-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 6
+// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP147]], align 8
+// CHECK1-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 6
+// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP148]], align 8
+// CHECK1-NEXT:    [[TMP149:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP149]], align 8
+// CHECK1-NEXT:    [[TMP150:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr [[VLA1]], ptr [[TMP150]], align 8
+// CHECK1-NEXT:    [[TMP151:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr [[VLA1]], ptr [[TMP151]], align 8
+// CHECK1-NEXT:    [[TMP152:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 7
+// CHECK1-NEXT:    store i64 [[TMP127]], ptr [[TMP152]], align 8
+// CHECK1-NEXT:    [[TMP153:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP153]], align 8
+// CHECK1-NEXT:    [[TMP154:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 8
+// CHECK1-NEXT:    store ptr [[D]], ptr [[TMP154]], align 8
+// CHECK1-NEXT:    [[TMP155:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 8
+// CHECK1-NEXT:    store ptr [[D]], ptr [[TMP155]], align 8
+// CHECK1-NEXT:    [[TMP156:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i64 0, i64 8
+// CHECK1-NEXT:    store ptr null, ptr [[TMP156]], align 8
+// CHECK1-NEXT:    [[TMP157:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 2, ptr [[TMP160]], align 4
+// CHECK1-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 9, ptr [[TMP161]], align 4
+// CHECK1-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP157]], ptr [[TMP162]], align 8
+// CHECK1-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP158]], ptr [[TMP163]], align 8
+// CHECK1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr [[TMP159]], ptr [[TMP164]], align 8
+// CHECK1-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP165]], align 8
+// CHECK1-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP166]], align 8
+// CHECK1-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP167]], align 8
+// CHECK1-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP168]], align 8
+// CHECK1-NEXT:    [[TMP169:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 0, ptr [[TMP169]], align 8
+// CHECK1-NEXT:    [[TMP170:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP170]], align 4
+// CHECK1-NEXT:    [[TMP171:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP171]], align 4
+// CHECK1-NEXT:    [[TMP172:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 0, ptr [[TMP172]], align 4
+// CHECK1-NEXT:    [[TMP173:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.region_id, ptr [[KERNEL_ARGS30]])
+// CHECK1-NEXT:    [[TMP174:%.*]] = icmp ne i32 [[TMP173]], 0
+// CHECK1-NEXT:    br i1 [[TMP174]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]]
 // CHECK1:       omp_offload.failed31:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154(i64 [[TMP149]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148(i64 [[TMP123]], ptr [[B]], i64 [[TMP2]], ptr [[VLA]], ptr [[C]], i64 5, i64 [[TMP5]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT32]]
 // CHECK1:       omp_offload.cont32:
-// CHECK1-NEXT:    [[TMP170:%.*]] = load i32, ptr [[NN]], align 4
-// CHECK1-NEXT:    store i32 [[TMP170]], ptr [[NN_CASTED33]], align 4
-// CHECK1-NEXT:    [[TMP171:%.*]] = load i64, ptr [[NN_CASTED33]], align 8
-// CHECK1-NEXT:    [[TMP172:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS34]], i32 0, i32 0
-// CHECK1-NEXT:    store i64 [[TMP171]], ptr [[TMP172]], align 8
-// CHECK1-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS35]], i32 0, i32 0
-// CHECK1-NEXT:    store i64 [[TMP171]], ptr [[TMP173]], align 8
-// CHECK1-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS36]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr null, ptr [[TMP174]], align 8
-// CHECK1-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS34]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS35]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP177]], align 4
-// CHECK1-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 1
-// CHECK1-NEXT:    store i32 1, ptr [[TMP178]], align 4
-// CHECK1-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 2
-// CHECK1-NEXT:    store ptr [[TMP175]], ptr [[TMP179]], align 8
-// CHECK1-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 3
-// CHECK1-NEXT:    store ptr [[TMP176]], ptr [[TMP180]], align 8
-// CHECK1-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP181]], align 8
-// CHECK1-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP182]], align 8
-// CHECK1-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 6
-// CHECK1-NEXT:    store ptr null, ptr [[TMP183]], align 8
-// CHECK1-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 7
-// CHECK1-NEXT:    store ptr null, ptr [[TMP184]], align 8
-// CHECK1-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 8
-// CHECK1-NEXT:    store i64 0, ptr [[TMP185]], align 8
-// CHECK1-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 0, ptr [[TMP186]], align 8
-// CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 10
-// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP187]], align 4
-// CHECK1-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 11
-// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP188]], align 4
-// CHECK1-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 12
-// CHECK1-NEXT:    store i32 0, ptr [[TMP189]], align 4
-// CHECK1-NEXT:    [[TMP190:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.region_id, ptr [[KERNEL_ARGS37]])
-// CHECK1-NEXT:    [[TMP191:%.*]] = icmp ne i32 [[TMP190]], 0
-// CHECK1-NEXT:    br i1 [[TMP191]], label [[OMP_OFFLOAD_FAILED38:%.*]], label [[OMP_OFFLOAD_CONT39:%.*]]
-// CHECK1:       omp_offload.failed38:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157(i64 [[TMP171]]) #[[ATTR3]]
-// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT39]]
-// CHECK1:       omp_offload.cont39:
-// CHECK1-NEXT:    [[TMP192:%.*]] = load i32, ptr [[A]], align 4
-// CHECK1-NEXT:    [[TMP193:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
-// CHECK1-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP193]])
-// CHECK1-NEXT:    ret i32 [[TMP192]]
+// CHECK1-NEXT:    br label [[OMP_IF_END34:%.*]]
+// CHECK1:       omp_if.else33:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148(i64 [[TMP123]], ptr [[B]], i64 [[TMP2]], ptr [[VLA]], ptr [[C]], i64 5, i64 [[TMP5]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
+// CHECK1-NEXT:    br label [[OMP_IF_END34]]
+// CHECK1:       omp_if.end34:
+// CHECK1-NEXT:    store i32 0, ptr [[NN]], align 4
+// CHECK1-NEXT:    [[TMP175:%.*]] = load i32, ptr [[NN]], align 4
+// CHECK1-NEXT:    store i32 [[TMP175]], ptr [[NN_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP176:%.*]] = load i64, ptr [[NN_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP176]], ptr [[TMP177]], align 8
+// CHECK1-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP176]], ptr [[TMP178]], align 8
+// CHECK1-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP179]], align 8
+// CHECK1-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 2, ptr [[TMP182]], align 4
+// CHECK1-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 1, ptr [[TMP183]], align 4
+// CHECK1-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP180]], ptr [[TMP184]], align 8
+// CHECK1-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP181]], ptr [[TMP185]], align 8
+// CHECK1-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP186]], align 8
+// CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP187]], align 8
+// CHECK1-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP188]], align 8
+// CHECK1-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP189]], align 8
+// CHECK1-NEXT:    [[TMP190:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP190]], align 8
+// CHECK1-NEXT:    [[TMP191:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 0, ptr [[TMP191]], align 8
+// CHECK1-NEXT:    [[TMP192:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP192]], align 4
+// CHECK1-NEXT:    [[TMP193:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP193]], align 4
+// CHECK1-NEXT:    [[TMP194:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 0, ptr [[TMP194]], align 4
+// CHECK1-NEXT:    [[TMP195:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.region_id, ptr [[KERNEL_ARGS38]])
+// CHECK1-NEXT:    [[TMP196:%.*]] = icmp ne i32 [[TMP195]], 0
+// CHECK1-NEXT:    br i1 [[TMP196]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]]
+// CHECK1:       omp_offload.failed39:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160(i64 [[TMP176]]) #[[ATTR3]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT40]]
+// CHECK1:       omp_offload.cont40:
+// CHECK1-NEXT:    [[TMP197:%.*]] = load i32, ptr [[NN]], align 4
+// CHECK1-NEXT:    store i32 [[TMP197]], ptr [[NN_CASTED41]], align 4
+// CHECK1-NEXT:    [[TMP198:%.*]] = load i64, ptr [[NN_CASTED41]], align 8
+// CHECK1-NEXT:    [[TMP199:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP198]], ptr [[TMP199]], align 8
+// CHECK1-NEXT:    [[TMP200:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP198]], ptr [[TMP200]], align 8
+// CHECK1-NEXT:    [[TMP201:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS44]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP201]], align 8
+// CHECK1-NEXT:    [[TMP202:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP203:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP204:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 2, ptr [[TMP204]], align 4
+// CHECK1-NEXT:    [[TMP205:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 1, ptr [[TMP205]], align 4
+// CHECK1-NEXT:    [[TMP206:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP202]], ptr [[TMP206]], align 8
+// CHECK1-NEXT:    [[TMP207:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP203]], ptr [[TMP207]], align 8
+// CHECK1-NEXT:    [[TMP208:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP208]], align 8
+// CHECK1-NEXT:    [[TMP209:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP209]], align 8
+// CHECK1-NEXT:    [[TMP210:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP210]], align 8
+// CHECK1-NEXT:    [[TMP211:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP211]], align 8
+// CHECK1-NEXT:    [[TMP212:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP212]], align 8
+// CHECK1-NEXT:    [[TMP213:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 0, ptr [[TMP213]], align 8
+// CHECK1-NEXT:    [[TMP214:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP214]], align 4
+// CHECK1-NEXT:    [[TMP215:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP215]], align 4
+// CHECK1-NEXT:    [[TMP216:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 0, ptr [[TMP216]], align 4
+// CHECK1-NEXT:    [[TMP217:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.region_id, ptr [[KERNEL_ARGS45]])
+// CHECK1-NEXT:    [[TMP218:%.*]] = icmp ne i32 [[TMP217]], 0
+// CHECK1-NEXT:    br i1 [[TMP218]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]]
+// CHECK1:       omp_offload.failed46:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163(i64 [[TMP198]]) #[[ATTR3]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT47]]
+// CHECK1:       omp_offload.cont47:
+// CHECK1-NEXT:    [[TMP219:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    [[TMP220:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
+// CHECK1-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP220]])
+// CHECK1-NEXT:    ret i32 [[TMP219]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101
@@ -830,68 +895,68 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
-// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
-// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]])
-// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]])
-// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24
-// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !24
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]])
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]])
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]])
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !25
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !25
 // CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]]
-// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !24
-// CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !24
-// CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !24
-// CHECK1-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !24
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !25
+// CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !25
+// CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !25
+// CHECK1-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !24
+// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !25
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
-// CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias !24
+// CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias !25
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
-// CHECK1-NEXT:    store ptr [[TMP13]], ptr [[TMP23]], align 8, !noalias !24
+// CHECK1-NEXT:    store ptr [[TMP13]], ptr [[TMP23]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
-// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP24]], align 8, !noalias !24
+// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP24]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr [[TMP15]], ptr [[TMP25]], align 8, !noalias !24
+// CHECK1-NEXT:    store ptr [[TMP15]], ptr [[TMP25]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes, ptr [[TMP26]], align 8, !noalias !24
+// CHECK1-NEXT:    store ptr @.offload_maptypes, ptr [[TMP26]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
-// CHECK1-NEXT:    store ptr null, ptr [[TMP27]], align 8, !noalias !24
+// CHECK1-NEXT:    store ptr null, ptr [[TMP27]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
-// CHECK1-NEXT:    store ptr null, ptr [[TMP28]], align 8, !noalias !24
+// CHECK1-NEXT:    store ptr null, ptr [[TMP28]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
-// CHECK1-NEXT:    store i64 0, ptr [[TMP29]], align 8, !noalias !24
+// CHECK1-NEXT:    store i64 0, ptr [[TMP29]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
-// CHECK1-NEXT:    store i64 1, ptr [[TMP30]], align 8, !noalias !24
+// CHECK1-NEXT:    store i64 1, ptr [[TMP30]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
-// CHECK1-NEXT:    store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias !24
+// CHECK1-NEXT:    store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias !25
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
-// CHECK1-NEXT:    store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias !24
+// CHECK1-NEXT:    store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias !25
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
-// CHECK1-NEXT:    store i32 0, ptr [[TMP33]], align 4, !noalias !24
+// CHECK1-NEXT:    store i32 0, ptr [[TMP33]], align 4, !noalias !25
 // CHECK1-NEXT:    [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP18]], i32 [[TMP19]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, ptr [[KERNEL_ARGS_I]])
 // CHECK1-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
 // CHECK1-NEXT:    br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
 // CHECK1:       omp_offload.failed.i:
 // CHECK1-NEXT:    [[TMP36:%.*]] = load i16, ptr [[TMP12]], align 2
-// CHECK1-NEXT:    store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias !24
-// CHECK1-NEXT:    [[TMP37:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !24
+// CHECK1-NEXT:    store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias !25
+// CHECK1-NEXT:    [[TMP37:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP16]], align 4
-// CHECK1-NEXT:    store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !24
-// CHECK1-NEXT:    [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias !24
+// CHECK1-NEXT:    store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !25
+// CHECK1-NEXT:    [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias !25
 // CHECK1-NEXT:    [[TMP40:%.*]] = load i32, ptr [[TMP17]], align 4
-// CHECK1-NEXT:    store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !24
-// CHECK1-NEXT:    [[TMP41:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias !24
+// CHECK1-NEXT:    store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !25
+// CHECK1-NEXT:    [[TMP41:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias !25
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101(i64 [[TMP37]], i64 [[TMP39]], i64 [[TMP41]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
 // CHECK1:       .omp_outlined..exit:
@@ -997,7 +1062,48 @@ int bar(int n){
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124
+// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]]) #[[ATTR2]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK1-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[AA_CASTED]], align 8
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined, i64 [[TMP1]], i64 [[TMP3]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]]) #[[ATTR2]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK1-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK1-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK1-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148
 // CHECK1-SAME: (i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -1030,11 +1136,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[A_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined, i64 [[TMP9]], ptr [[TMP0]], i64 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP4]], i64 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined, i64 [[TMP9]], ptr [[TMP0]], i64 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP4]], i64 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1106,7 +1212,7 @@ int bar(int n){
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160
 // CHECK1-SAME: (i64 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[NN_ADDR:%.*]] = alloca i64, align 8
@@ -1115,11 +1221,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined, i64 [[TMP1]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined, i64 [[TMP1]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1132,11 +1238,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined, i64 [[TMP1]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined, i64 [[TMP1]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1148,7 +1254,7 @@ int bar(int n){
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163
 // CHECK1-SAME: (i64 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[NN_ADDR:%.*]] = alloca i64, align 8
@@ -1157,11 +1263,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined, i64 [[TMP1]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined, i64 [[TMP1]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1170,11 +1276,11 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
 // CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
 // CHECK1-NEXT:    store i64 [[NN]], ptr [[NN_ADDR]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[NN:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1217,9 +1323,9 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
 // CHECK1-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP11]], align 8
+// CHECK1-NEXT:    store ptr @.offload_sizes.13, ptr [[TMP11]], align 8
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP12]], align 8
+// CHECK1-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP12]], align 8
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK1-NEXT:    store ptr null, ptr [[TMP13]], align 8
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -1234,27 +1340,27 @@ int bar(int n){
 // CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK1-NEXT:    store i32 0, ptr [[TMP19]], align 4
-// CHECK1-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.region_id, ptr [[KERNEL_ARGS]])
 // CHECK1-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
 // CHECK1-NEXT:    br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK1:       omp_offload.failed:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182(i64 [[TMP1]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188(i64 [[TMP1]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188
 // CHECK1-SAME: (i64 noundef [[VLA:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined, i64 [[TMP0]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined, i64 [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1337,7 +1443,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = mul nuw i64 2, [[TMP2]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.13, i64 40, i1 false)
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes.15, i64 40, i1 false)
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
@@ -1384,7 +1490,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
 // CHECK1-NEXT:    store ptr [[TMP28]], ptr [[TMP33]], align 8
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP34]], align 8
+// CHECK1-NEXT:    store ptr @.offload_maptypes.16, ptr [[TMP34]], align 8
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK1-NEXT:    store ptr null, ptr [[TMP35]], align 8
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -1399,16 +1505,16 @@ int bar(int n){
 // CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK1-NEXT:    store i32 0, ptr [[TMP41]], align 4
-// CHECK1-NEXT:    [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.region_id, ptr [[KERNEL_ARGS]])
 // CHECK1-NEXT:    [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
 // CHECK1-NEXT:    br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK1:       omp_offload.failed:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227(ptr [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], ptr [[VLA]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233(ptr [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], ptr [[VLA]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    br label [[OMP_IF_END:%.*]]
 // CHECK1:       omp_if.else:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227(ptr [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], ptr [[VLA]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233(ptr [[THIS1]], i64 [[TMP6]], i64 2, i64 [[TMP2]], ptr [[VLA]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_IF_END]]
 // CHECK1:       omp_if.end:
 // CHECK1-NEXT:    [[TMP44:%.*]] = mul nsw i64 1, [[TMP2]]
@@ -1490,9 +1596,9 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
 // CHECK1-NEXT:    store ptr [[TMP20]], ptr [[TMP24]], align 8
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr @.offload_sizes.15, ptr [[TMP25]], align 8
+// CHECK1-NEXT:    store ptr @.offload_sizes.17, ptr [[TMP25]], align 8
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.16, ptr [[TMP26]], align 8
+// CHECK1-NEXT:    store ptr @.offload_maptypes.18, ptr [[TMP26]], align 8
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK1-NEXT:    store ptr null, ptr [[TMP27]], align 8
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -1507,16 +1613,16 @@ int bar(int n){
 // CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK1-NEXT:    store i32 0, ptr [[TMP33]], align 4
-// CHECK1-NEXT:    [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.region_id, ptr [[KERNEL_ARGS]])
 // CHECK1-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
 // CHECK1-NEXT:    br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK1:       omp_offload.failed:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209(i64 [[TMP1]], i64 [[TMP3]], i64 [[TMP5]], ptr [[B]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215(i64 [[TMP1]], i64 [[TMP3]], i64 [[TMP5]], ptr [[B]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    br label [[OMP_IF_END:%.*]]
 // CHECK1:       omp_if.else:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209(i64 [[TMP1]], i64 [[TMP3]], i64 [[TMP5]], ptr [[B]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215(i64 [[TMP1]], i64 [[TMP3]], i64 [[TMP5]], ptr [[B]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_IF_END]]
 // CHECK1:       omp_if.end:
 // CHECK1-NEXT:    [[TMP36:%.*]] = load i32, ptr [[A]], align 4
@@ -1578,9 +1684,9 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
 // CHECK1-NEXT:    store ptr [[TMP15]], ptr [[TMP19]], align 8
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK1-NEXT:    store ptr @.offload_sizes.17, ptr [[TMP20]], align 8
+// CHECK1-NEXT:    store ptr @.offload_sizes.19, ptr [[TMP20]], align 8
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK1-NEXT:    store ptr @.offload_maptypes.18, ptr [[TMP21]], align 8
+// CHECK1-NEXT:    store ptr @.offload_maptypes.20, ptr [[TMP21]], align 8
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK1-NEXT:    store ptr null, ptr [[TMP22]], align 8
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -1595,23 +1701,23 @@ int bar(int n){
 // CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK1-NEXT:    store i32 0, ptr [[TMP28]], align 4
-// CHECK1-NEXT:    [[TMP29:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP29:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.region_id, ptr [[KERNEL_ARGS]])
 // CHECK1-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
 // CHECK1-NEXT:    br i1 [[TMP30]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK1:       omp_offload.failed:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192(i64 [[TMP1]], i64 [[TMP3]], ptr [[B]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198(i64 [[TMP1]], i64 [[TMP3]], ptr [[B]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    br label [[OMP_IF_END:%.*]]
 // CHECK1:       omp_if.else:
-// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192(i64 [[TMP1]], i64 [[TMP3]], ptr [[B]]) #[[ATTR3]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198(i64 [[TMP1]], i64 [[TMP3]], ptr [[B]]) #[[ATTR3]]
 // CHECK1-NEXT:    br label [[OMP_IF_END]]
 // CHECK1:       omp_if.end:
 // CHECK1-NEXT:    [[TMP31:%.*]] = load i32, ptr [[A]], align 4
 // CHECK1-NEXT:    ret i32 [[TMP31]]
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233
 // CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
@@ -1632,11 +1738,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[B_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined, ptr [[TMP0]], i64 [[TMP5]], i64 [[TMP1]], i64 [[TMP2]], ptr [[TMP3]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined, ptr [[TMP0]], i64 [[TMP5]], i64 [[TMP1]], i64 [[TMP2]], ptr [[TMP3]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1674,7 +1780,7 @@ int bar(int n){
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215
 // CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], i64 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -1698,11 +1804,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8, ptr [[AAA_ADDR]], align 1
 // CHECK1-NEXT:    store i8 [[TMP5]], ptr [[AAA_CASTED]], align 1
 // CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[AAA_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], ptr [[TMP0]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], ptr [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], i64 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1738,7 +1844,7 @@ int bar(int n){
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198
 // CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -1756,11 +1862,11 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[AA_ADDR]], align 2
 // CHECK1-NEXT:    store i16 [[TMP3]], ptr [[AA_CASTED]], align 2
 // CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[AA_CASTED]], align 8
-// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], ptr [[TMP0]])
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], ptr [[TMP0]])
 // CHECK1-NEXT:    ret void
 //
 //
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined
 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1830,22 +1936,28 @@ int bar(int n){
 // CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS12:%.*]] = alloca [2 x ptr], align 4
 // CHECK3-NEXT:    [[KERNEL_ARGS13:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[A_CASTED16:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS19:%.*]] = alloca [9 x ptr], align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS20:%.*]] = alloca [9 x ptr], align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS21:%.*]] = alloca [9 x ptr], align 4
+// CHECK3-NEXT:    [[AA_CASTED17:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS18:%.*]] = alloca [2 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS19:%.*]] = alloca [2 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS20:%.*]] = alloca [2 x ptr], align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS21:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT:    [[A_CASTED24:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [9 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS28:%.*]] = alloca [9 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [9 x ptr], align 4
 // CHECK3-NEXT:    [[DOTOFFLOAD_SIZES:%.*]] = alloca [9 x i64], align 4
-// CHECK3-NEXT:    [[KERNEL_ARGS22:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT:    [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[NN:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[NN_CASTED:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS27:%.*]] = alloca [1 x ptr], align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS28:%.*]] = alloca [1 x ptr], align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS29:%.*]] = alloca [1 x ptr], align 4
-// CHECK3-NEXT:    [[KERNEL_ARGS30:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
-// CHECK3-NEXT:    [[NN_CASTED33:%.*]] = alloca i32, align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS34:%.*]] = alloca [1 x ptr], align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS35:%.*]] = alloca [1 x ptr], align 4
-// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS36:%.*]] = alloca [1 x ptr], align 4
-// CHECK3-NEXT:    [[KERNEL_ARGS37:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS35:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS36:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS37:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS38:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT:    [[NN_CASTED41:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS42:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS43:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS44:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS45:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
@@ -2030,208 +2142,261 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP93:%.*]] = load i32, ptr [[A]], align 4
 // CHECK3-NEXT:    store i32 [[TMP93]], ptr [[A_CASTED16]], align 4
 // CHECK3-NEXT:    [[TMP94:%.*]] = load i32, ptr [[A_CASTED16]], align 4
-// CHECK3-NEXT:    [[TMP95:%.*]] = load i32, ptr [[N_ADDR]], align 4
-// CHECK3-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[TMP95]], 20
-// CHECK3-NEXT:    br i1 [[CMP17]], label [[OMP_IF_THEN18:%.*]], label [[OMP_IF_ELSE25:%.*]]
-// CHECK3:       omp_if.then18:
-// CHECK3-NEXT:    [[TMP96:%.*]] = mul nuw i32 [[TMP1]], 4
-// CHECK3-NEXT:    [[TMP97:%.*]] = sext i32 [[TMP96]] to i64
-// CHECK3-NEXT:    [[TMP98:%.*]] = mul nuw i32 5, [[TMP3]]
-// CHECK3-NEXT:    [[TMP99:%.*]] = mul nuw i32 [[TMP98]], 8
-// CHECK3-NEXT:    [[TMP100:%.*]] = sext i32 [[TMP99]] to i64
-// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.5, i32 72, i1 false)
-// CHECK3-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 [[TMP94]], ptr [[TMP101]], align 4
-// CHECK3-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 [[TMP94]], ptr [[TMP102]], align 4
-// CHECK3-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr null, ptr [[TMP103]], align 4
-// CHECK3-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 1
-// CHECK3-NEXT:    store ptr [[B]], ptr [[TMP104]], align 4
-// CHECK3-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 1
-// CHECK3-NEXT:    store ptr [[B]], ptr [[TMP105]], align 4
-// CHECK3-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 1
-// CHECK3-NEXT:    store ptr null, ptr [[TMP106]], align 4
-// CHECK3-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 2
-// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[TMP107]], align 4
-// CHECK3-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 2
-// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[TMP108]], align 4
-// CHECK3-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 2
-// CHECK3-NEXT:    store ptr null, ptr [[TMP109]], align 4
-// CHECK3-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr [[VLA]], ptr [[TMP110]], align 4
-// CHECK3-NEXT:    [[TMP111:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr [[VLA]], ptr [[TMP111]], align 4
-// CHECK3-NEXT:    [[TMP112:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 3
-// CHECK3-NEXT:    store i64 [[TMP97]], ptr [[TMP112]], align 4
-// CHECK3-NEXT:    [[TMP113:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr null, ptr [[TMP113]], align 4
-// CHECK3-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr [[C]], ptr [[TMP114]], align 4
-// CHECK3-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr [[C]], ptr [[TMP115]], align 4
-// CHECK3-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr null, ptr [[TMP116]], align 4
-// CHECK3-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 5
-// CHECK3-NEXT:    store i32 5, ptr [[TMP117]], align 4
-// CHECK3-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 5
-// CHECK3-NEXT:    store i32 5, ptr [[TMP118]], align 4
-// CHECK3-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr null, ptr [[TMP119]], align 4
-// CHECK3-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 6
-// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP120]], align 4
-// CHECK3-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 6
-// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP121]], align 4
-// CHECK3-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 6
-// CHECK3-NEXT:    store ptr null, ptr [[TMP122]], align 4
-// CHECK3-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 7
-// CHECK3-NEXT:    store ptr [[VLA1]], ptr [[TMP123]], align 4
-// CHECK3-NEXT:    [[TMP124:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 7
-// CHECK3-NEXT:    store ptr [[VLA1]], ptr [[TMP124]], align 4
-// CHECK3-NEXT:    [[TMP125:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 7
-// CHECK3-NEXT:    store i64 [[TMP100]], ptr [[TMP125]], align 4
-// CHECK3-NEXT:    [[TMP126:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 7
-// CHECK3-NEXT:    store ptr null, ptr [[TMP126]], align 4
-// CHECK3-NEXT:    [[TMP127:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 8
-// CHECK3-NEXT:    store ptr [[D]], ptr [[TMP127]], align 4
-// CHECK3-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 8
-// CHECK3-NEXT:    store ptr [[D]], ptr [[TMP128]], align 4
-// CHECK3-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS21]], i32 0, i32 8
-// CHECK3-NEXT:    store ptr null, ptr [[TMP129]], align 4
-// CHECK3-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP133]], align 4
-// CHECK3-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 1
-// CHECK3-NEXT:    store i32 9, ptr [[TMP134]], align 4
-// CHECK3-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 2
-// CHECK3-NEXT:    store ptr [[TMP130]], ptr [[TMP135]], align 4
-// CHECK3-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr [[TMP131]], ptr [[TMP136]], align 4
-// CHECK3-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr [[TMP132]], ptr [[TMP137]], align 4
-// CHECK3-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP138]], align 4
-// CHECK3-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 6
-// CHECK3-NEXT:    store ptr null, ptr [[TMP139]], align 4
-// CHECK3-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 7
+// CHECK3-NEXT:    [[TMP95:%.*]] = load i16, ptr [[AA]], align 2
+// CHECK3-NEXT:    store i16 [[TMP95]], ptr [[AA_CASTED17]], align 2
+// CHECK3-NEXT:    [[TMP96:%.*]] = load i32, ptr [[AA_CASTED17]], align 4
+// CHECK3-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP94]], ptr [[TMP97]], align 4
+// CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP94]], ptr [[TMP98]], align 4
+// CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS20]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP99]], align 4
+// CHECK3-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP96]], ptr [[TMP100]], align 4
+// CHECK3-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP96]], ptr [[TMP101]], align 4
+// CHECK3-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS20]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr null, ptr [[TMP102]], align 4
+// CHECK3-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK3-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 2, ptr [[TMP106]], align 4
+// CHECK3-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP103]], ptr [[TMP107]], align 4
+// CHECK3-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP104]], ptr [[TMP108]], align 4
+// CHECK3-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.5, ptr [[TMP109]], align 4
+// CHECK3-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP110]], align 4
+// CHECK3-NEXT:    [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP111]], align 4
+// CHECK3-NEXT:    [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP112]], align 4
+// CHECK3-NEXT:    [[TMP113:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP113]], align 8
+// CHECK3-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 0, ptr [[TMP114]], align 8
+// CHECK3-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP115]], align 4
+// CHECK3-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP116]], align 4
+// CHECK3-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 0, ptr [[TMP117]], align 4
+// CHECK3-NEXT:    [[TMP118:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.region_id, ptr [[KERNEL_ARGS21]])
+// CHECK3-NEXT:    [[TMP119:%.*]] = icmp ne i32 [[TMP118]], 0
+// CHECK3-NEXT:    br i1 [[TMP119]], label [[OMP_OFFLOAD_FAILED22:%.*]], label [[OMP_OFFLOAD_CONT23:%.*]]
+// CHECK3:       omp_offload.failed22:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124(i32 [[TMP94]], i32 [[TMP96]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT23]]
+// CHECK3:       omp_offload.cont23:
+// CHECK3-NEXT:    [[TMP120:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    store i32 [[TMP120]], ptr [[A_CASTED24]], align 4
+// CHECK3-NEXT:    [[TMP121:%.*]] = load i32, ptr [[A_CASTED24]], align 4
+// CHECK3-NEXT:    [[TMP122:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[TMP122]], 20
+// CHECK3-NEXT:    br i1 [[CMP25]], label [[OMP_IF_THEN26:%.*]], label [[OMP_IF_ELSE33:%.*]]
+// CHECK3:       omp_if.then26:
+// CHECK3-NEXT:    [[TMP123:%.*]] = mul nuw i32 [[TMP1]], 4
+// CHECK3-NEXT:    [[TMP124:%.*]] = sext i32 [[TMP123]] to i64
+// CHECK3-NEXT:    [[TMP125:%.*]] = mul nuw i32 5, [[TMP3]]
+// CHECK3-NEXT:    [[TMP126:%.*]] = mul nuw i32 [[TMP125]], 8
+// CHECK3-NEXT:    [[TMP127:%.*]] = sext i32 [[TMP126]] to i64
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.7, i32 72, i1 false)
+// CHECK3-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP121]], ptr [[TMP128]], align 4
+// CHECK3-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP121]], ptr [[TMP129]], align 4
+// CHECK3-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP130]], align 4
+// CHECK3-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr [[B]], ptr [[TMP131]], align 4
+// CHECK3-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr [[B]], ptr [[TMP132]], align 4
+// CHECK3-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr null, ptr [[TMP133]], align 4
+// CHECK3-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[TMP134]], align 4
+// CHECK3-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[TMP135]], align 4
+// CHECK3-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr null, ptr [[TMP136]], align 4
+// CHECK3-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[VLA]], ptr [[TMP137]], align 4
+// CHECK3-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[VLA]], ptr [[TMP138]], align 4
+// CHECK3-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 3
+// CHECK3-NEXT:    store i64 [[TMP124]], ptr [[TMP139]], align 4
+// CHECK3-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr null, ptr [[TMP140]], align 4
-// CHECK3-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 8
-// CHECK3-NEXT:    store i64 0, ptr [[TMP141]], align 8
-// CHECK3-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP142]], align 8
-// CHECK3-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 10
-// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP143]], align 4
-// CHECK3-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 11
-// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP144]], align 4
-// CHECK3-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 12
-// CHECK3-NEXT:    store i32 0, ptr [[TMP145]], align 4
-// CHECK3-NEXT:    [[TMP146:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.region_id, ptr [[KERNEL_ARGS22]])
-// CHECK3-NEXT:    [[TMP147:%.*]] = icmp ne i32 [[TMP146]], 0
-// CHECK3-NEXT:    br i1 [[TMP147]], label [[OMP_OFFLOAD_FAILED23:%.*]], label [[OMP_OFFLOAD_CONT24:%.*]]
-// CHECK3:       omp_offload.failed23:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142(i32 [[TMP94]], ptr [[B]], i32 [[TMP1]], ptr [[VLA]], ptr [[C]], i32 5, i32 [[TMP3]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
-// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT24]]
-// CHECK3:       omp_offload.cont24:
-// CHECK3-NEXT:    br label [[OMP_IF_END26:%.*]]
-// CHECK3:       omp_if.else25:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142(i32 [[TMP94]], ptr [[B]], i32 [[TMP1]], ptr [[VLA]], ptr [[C]], i32 5, i32 [[TMP3]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
-// CHECK3-NEXT:    br label [[OMP_IF_END26]]
-// CHECK3:       omp_if.end26:
-// CHECK3-NEXT:    store i32 0, ptr [[NN]], align 4
-// CHECK3-NEXT:    [[TMP148:%.*]] = load i32, ptr [[NN]], align 4
-// CHECK3-NEXT:    store i32 [[TMP148]], ptr [[NN_CASTED]], align 4
-// CHECK3-NEXT:    [[TMP149:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK3-NEXT:    [[TMP150:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 [[TMP149]], ptr [[TMP150]], align 4
-// CHECK3-NEXT:    [[TMP151:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 [[TMP149]], ptr [[TMP151]], align 4
-// CHECK3-NEXT:    [[TMP152:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr null, ptr [[TMP152]], align 4
-// CHECK3-NEXT:    [[TMP153:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP154:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP155:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP155]], align 4
-// CHECK3-NEXT:    [[TMP156:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
-// CHECK3-NEXT:    store i32 1, ptr [[TMP156]], align 4
-// CHECK3-NEXT:    [[TMP157:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
-// CHECK3-NEXT:    store ptr [[TMP153]], ptr [[TMP157]], align 4
-// CHECK3-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr [[TMP154]], ptr [[TMP158]], align 4
-// CHECK3-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP159]], align 4
-// CHECK3-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP160]], align 4
-// CHECK3-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6
-// CHECK3-NEXT:    store ptr null, ptr [[TMP161]], align 4
-// CHECK3-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7
-// CHECK3-NEXT:    store ptr null, ptr [[TMP162]], align 4
-// CHECK3-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8
-// CHECK3-NEXT:    store i64 0, ptr [[TMP163]], align 8
-// CHECK3-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP164]], align 8
-// CHECK3-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10
-// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP165]], align 4
-// CHECK3-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11
-// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP166]], align 4
-// CHECK3-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12
-// CHECK3-NEXT:    store i32 0, ptr [[TMP167]], align 4
-// CHECK3-NEXT:    [[TMP168:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.region_id, ptr [[KERNEL_ARGS30]])
-// CHECK3-NEXT:    [[TMP169:%.*]] = icmp ne i32 [[TMP168]], 0
-// CHECK3-NEXT:    br i1 [[TMP169]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]]
+// CHECK3-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr [[C]], ptr [[TMP141]], align 4
+// CHECK3-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr [[C]], ptr [[TMP142]], align 4
+// CHECK3-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr null, ptr [[TMP143]], align 4
+// CHECK3-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 5
+// CHECK3-NEXT:    store i32 5, ptr [[TMP144]], align 4
+// CHECK3-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 5
+// CHECK3-NEXT:    store i32 5, ptr [[TMP145]], align 4
+// CHECK3-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr null, ptr [[TMP146]], align 4
+// CHECK3-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 6
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP147]], align 4
+// CHECK3-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 6
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP148]], align 4
+// CHECK3-NEXT:    [[TMP149:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP149]], align 4
+// CHECK3-NEXT:    [[TMP150:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr [[VLA1]], ptr [[TMP150]], align 4
+// CHECK3-NEXT:    [[TMP151:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr [[VLA1]], ptr [[TMP151]], align 4
+// CHECK3-NEXT:    [[TMP152:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 7
+// CHECK3-NEXT:    store i64 [[TMP127]], ptr [[TMP152]], align 4
+// CHECK3-NEXT:    [[TMP153:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP153]], align 4
+// CHECK3-NEXT:    [[TMP154:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 8
+// CHECK3-NEXT:    store ptr [[D]], ptr [[TMP154]], align 4
+// CHECK3-NEXT:    [[TMP155:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 8
+// CHECK3-NEXT:    store ptr [[D]], ptr [[TMP155]], align 4
+// CHECK3-NEXT:    [[TMP156:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_MAPPERS29]], i32 0, i32 8
+// CHECK3-NEXT:    store ptr null, ptr [[TMP156]], align 4
+// CHECK3-NEXT:    [[TMP157:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 2, ptr [[TMP160]], align 4
+// CHECK3-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 9, ptr [[TMP161]], align 4
+// CHECK3-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP157]], ptr [[TMP162]], align 4
+// CHECK3-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP158]], ptr [[TMP163]], align 4
+// CHECK3-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr [[TMP159]], ptr [[TMP164]], align 4
+// CHECK3-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP165]], align 4
+// CHECK3-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP166]], align 4
+// CHECK3-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP167]], align 4
+// CHECK3-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP168]], align 8
+// CHECK3-NEXT:    [[TMP169:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 0, ptr [[TMP169]], align 8
+// CHECK3-NEXT:    [[TMP170:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP170]], align 4
+// CHECK3-NEXT:    [[TMP171:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP171]], align 4
+// CHECK3-NEXT:    [[TMP172:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 0, ptr [[TMP172]], align 4
+// CHECK3-NEXT:    [[TMP173:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.region_id, ptr [[KERNEL_ARGS30]])
+// CHECK3-NEXT:    [[TMP174:%.*]] = icmp ne i32 [[TMP173]], 0
+// CHECK3-NEXT:    br i1 [[TMP174]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]]
 // CHECK3:       omp_offload.failed31:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154(i32 [[TMP149]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148(i32 [[TMP121]], ptr [[B]], i32 [[TMP1]], ptr [[VLA]], ptr [[C]], i32 5, i32 [[TMP3]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT32]]
 // CHECK3:       omp_offload.cont32:
-// CHECK3-NEXT:    [[TMP170:%.*]] = load i32, ptr [[NN]], align 4
-// CHECK3-NEXT:    store i32 [[TMP170]], ptr [[NN_CASTED33]], align 4
-// CHECK3-NEXT:    [[TMP171:%.*]] = load i32, ptr [[NN_CASTED33]], align 4
-// CHECK3-NEXT:    [[TMP172:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS34]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 [[TMP171]], ptr [[TMP172]], align 4
-// CHECK3-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS35]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 [[TMP171]], ptr [[TMP173]], align 4
-// CHECK3-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS36]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr null, ptr [[TMP174]], align 4
-// CHECK3-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS34]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS35]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP177]], align 4
-// CHECK3-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 1
-// CHECK3-NEXT:    store i32 1, ptr [[TMP178]], align 4
-// CHECK3-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 2
-// CHECK3-NEXT:    store ptr [[TMP175]], ptr [[TMP179]], align 4
-// CHECK3-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr [[TMP176]], ptr [[TMP180]], align 4
-// CHECK3-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP181]], align 4
-// CHECK3-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP182]], align 4
-// CHECK3-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 6
-// CHECK3-NEXT:    store ptr null, ptr [[TMP183]], align 4
-// CHECK3-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 7
-// CHECK3-NEXT:    store ptr null, ptr [[TMP184]], align 4
-// CHECK3-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 8
-// CHECK3-NEXT:    store i64 0, ptr [[TMP185]], align 8
-// CHECK3-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 0, ptr [[TMP186]], align 8
-// CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 10
-// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP187]], align 4
-// CHECK3-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 11
-// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP188]], align 4
-// CHECK3-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS37]], i32 0, i32 12
-// CHECK3-NEXT:    store i32 0, ptr [[TMP189]], align 4
-// CHECK3-NEXT:    [[TMP190:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.region_id, ptr [[KERNEL_ARGS37]])
-// CHECK3-NEXT:    [[TMP191:%.*]] = icmp ne i32 [[TMP190]], 0
-// CHECK3-NEXT:    br i1 [[TMP191]], label [[OMP_OFFLOAD_FAILED38:%.*]], label [[OMP_OFFLOAD_CONT39:%.*]]
-// CHECK3:       omp_offload.failed38:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157(i32 [[TMP171]]) #[[ATTR3]]
-// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT39]]
-// CHECK3:       omp_offload.cont39:
-// CHECK3-NEXT:    [[TMP192:%.*]] = load i32, ptr [[A]], align 4
-// CHECK3-NEXT:    [[TMP193:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4
-// CHECK3-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP193]])
-// CHECK3-NEXT:    ret i32 [[TMP192]]
+// CHECK3-NEXT:    br label [[OMP_IF_END34:%.*]]
+// CHECK3:       omp_if.else33:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148(i32 [[TMP121]], ptr [[B]], i32 [[TMP1]], ptr [[VLA]], ptr [[C]], i32 5, i32 [[TMP3]], ptr [[VLA1]], ptr [[D]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[OMP_IF_END34]]
+// CHECK3:       omp_if.end34:
+// CHECK3-NEXT:    store i32 0, ptr [[NN]], align 4
+// CHECK3-NEXT:    [[TMP175:%.*]] = load i32, ptr [[NN]], align 4
+// CHECK3-NEXT:    store i32 [[TMP175]], ptr [[NN_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP176:%.*]] = load i32, ptr [[NN_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP176]], ptr [[TMP177]], align 4
+// CHECK3-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP176]], ptr [[TMP178]], align 4
+// CHECK3-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS37]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP179]], align 4
+// CHECK3-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 2, ptr [[TMP182]], align 4
+// CHECK3-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 1, ptr [[TMP183]], align 4
+// CHECK3-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP180]], ptr [[TMP184]], align 4
+// CHECK3-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP181]], ptr [[TMP185]], align 4
+// CHECK3-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP186]], align 4
+// CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP187]], align 4
+// CHECK3-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP188]], align 4
+// CHECK3-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP189]], align 4
+// CHECK3-NEXT:    [[TMP190:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP190]], align 8
+// CHECK3-NEXT:    [[TMP191:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 0, ptr [[TMP191]], align 8
+// CHECK3-NEXT:    [[TMP192:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP192]], align 4
+// CHECK3-NEXT:    [[TMP193:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP193]], align 4
+// CHECK3-NEXT:    [[TMP194:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 0, ptr [[TMP194]], align 4
+// CHECK3-NEXT:    [[TMP195:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.region_id, ptr [[KERNEL_ARGS38]])
+// CHECK3-NEXT:    [[TMP196:%.*]] = icmp ne i32 [[TMP195]], 0
+// CHECK3-NEXT:    br i1 [[TMP196]], label [[OMP_OFFLOAD_FAILED39:%.*]], label [[OMP_OFFLOAD_CONT40:%.*]]
+// CHECK3:       omp_offload.failed39:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160(i32 [[TMP176]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT40]]
+// CHECK3:       omp_offload.cont40:
+// CHECK3-NEXT:    [[TMP197:%.*]] = load i32, ptr [[NN]], align 4
+// CHECK3-NEXT:    store i32 [[TMP197]], ptr [[NN_CASTED41]], align 4
+// CHECK3-NEXT:    [[TMP198:%.*]] = load i32, ptr [[NN_CASTED41]], align 4
+// CHECK3-NEXT:    [[TMP199:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP198]], ptr [[TMP199]], align 4
+// CHECK3-NEXT:    [[TMP200:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP198]], ptr [[TMP200]], align 4
+// CHECK3-NEXT:    [[TMP201:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS44]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP201]], align 4
+// CHECK3-NEXT:    [[TMP202:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP203:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP204:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 2, ptr [[TMP204]], align 4
+// CHECK3-NEXT:    [[TMP205:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 1, ptr [[TMP205]], align 4
+// CHECK3-NEXT:    [[TMP206:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP202]], ptr [[TMP206]], align 4
+// CHECK3-NEXT:    [[TMP207:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP203]], ptr [[TMP207]], align 4
+// CHECK3-NEXT:    [[TMP208:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP208]], align 4
+// CHECK3-NEXT:    [[TMP209:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP209]], align 4
+// CHECK3-NEXT:    [[TMP210:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP210]], align 4
+// CHECK3-NEXT:    [[TMP211:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP211]], align 4
+// CHECK3-NEXT:    [[TMP212:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP212]], align 8
+// CHECK3-NEXT:    [[TMP213:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 0, ptr [[TMP213]], align 8
+// CHECK3-NEXT:    [[TMP214:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP214]], align 4
+// CHECK3-NEXT:    [[TMP215:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP215]], align 4
+// CHECK3-NEXT:    [[TMP216:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 0, ptr [[TMP216]], align 4
+// CHECK3-NEXT:    [[TMP217:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.region_id, ptr [[KERNEL_ARGS45]])
+// CHECK3-NEXT:    [[TMP218:%.*]] = icmp ne i32 [[TMP217]], 0
+// CHECK3-NEXT:    br i1 [[TMP218]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]]
+// CHECK3:       omp_offload.failed46:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163(i32 [[TMP198]]) #[[ATTR3]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT47]]
+// CHECK3:       omp_offload.cont47:
+// CHECK3-NEXT:    [[TMP219:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    [[TMP220:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4
+// CHECK3-NEXT:    call void @llvm.stackrestore.p0(ptr [[TMP220]])
+// CHECK3-NEXT:    ret i32 [[TMP219]]
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101
@@ -2324,68 +2489,68 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
-// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
-// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]])
-// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]])
-// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]])
-// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !25
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]])
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]])
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]])
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !26
 // CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]]
-// CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !25
+// CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !25
+// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
-// CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias !25
+// CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
-// CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP23]], align 4, !noalias !25
+// CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP23]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
-// CHECK3-NEXT:    store ptr [[TMP14]], ptr [[TMP24]], align 4, !noalias !25
+// CHECK3-NEXT:    store ptr [[TMP14]], ptr [[TMP24]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP25]], align 4, !noalias !25
+// CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP25]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes, ptr [[TMP26]], align 4, !noalias !25
+// CHECK3-NEXT:    store ptr @.offload_maptypes, ptr [[TMP26]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
-// CHECK3-NEXT:    store ptr null, ptr [[TMP27]], align 4, !noalias !25
+// CHECK3-NEXT:    store ptr null, ptr [[TMP27]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
-// CHECK3-NEXT:    store ptr null, ptr [[TMP28]], align 4, !noalias !25
+// CHECK3-NEXT:    store ptr null, ptr [[TMP28]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
-// CHECK3-NEXT:    store i64 0, ptr [[TMP29]], align 8, !noalias !25
+// CHECK3-NEXT:    store i64 0, ptr [[TMP29]], align 8, !noalias !26
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
-// CHECK3-NEXT:    store i64 1, ptr [[TMP30]], align 8, !noalias !25
+// CHECK3-NEXT:    store i64 1, ptr [[TMP30]], align 8, !noalias !26
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
-// CHECK3-NEXT:    store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias !25
+// CHECK3-NEXT:    store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
-// CHECK3-NEXT:    store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias !25
+// CHECK3-NEXT:    store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
-// CHECK3-NEXT:    store i32 0, ptr [[TMP33]], align 4, !noalias !25
+// CHECK3-NEXT:    store i32 0, ptr [[TMP33]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP18]], i32 [[TMP19]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, ptr [[KERNEL_ARGS_I]])
 // CHECK3-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
 // CHECK3-NEXT:    br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
 // CHECK3:       omp_offload.failed.i:
 // CHECK3-NEXT:    [[TMP36:%.*]] = load i16, ptr [[TMP12]], align 2
-// CHECK3-NEXT:    store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias !25
-// CHECK3-NEXT:    [[TMP37:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !25
+// CHECK3-NEXT:    store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias !26
+// CHECK3-NEXT:    [[TMP37:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP16]], align 4
-// CHECK3-NEXT:    store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !25
+// CHECK3-NEXT:    store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !26
 // CHECK3-NEXT:    [[TMP40:%.*]] = load i32, ptr [[TMP17]], align 4
-// CHECK3-NEXT:    store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !25
-// CHECK3-NEXT:    [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !25
+// CHECK3-NEXT:    store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !26
+// CHECK3-NEXT:    [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !26
 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101(i32 [[TMP37]], i32 [[TMP39]], i32 [[TMP41]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
 // CHECK3:       .omp_outlined..exit:
@@ -2491,7 +2656,48 @@ int bar(int n){
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124
+// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR2]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK3-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined, i32 [[TMP1]], i32 [[TMP3]])
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR2]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK3-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK3-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK3-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148
 // CHECK3-SAME: (i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -2524,11 +2730,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined, i32 [[TMP9]], ptr [[TMP0]], i32 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i32 [[TMP4]], i32 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined, i32 [[TMP9]], ptr [[TMP0]], i32 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i32 [[TMP4]], i32 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -2600,7 +2806,7 @@ int bar(int n){
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160
 // CHECK3-SAME: (i32 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[NN_ADDR:%.*]] = alloca i32, align 4
@@ -2609,11 +2815,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined, i32 [[TMP1]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined, i32 [[TMP1]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -2626,11 +2832,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined, i32 [[TMP1]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined, i32 [[TMP1]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -2642,7 +2848,7 @@ int bar(int n){
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163
 // CHECK3-SAME: (i32 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[NN_ADDR:%.*]] = alloca i32, align 4
@@ -2651,11 +2857,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined, i32 [[TMP1]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined, i32 [[TMP1]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[NN:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -2664,11 +2870,11 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
 // CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[NN]], ptr [[NN_ADDR]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[NN:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -2710,9 +2916,9 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.13, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK3-NEXT:    store ptr null, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -2727,27 +2933,27 @@ int bar(int n){
 // CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK3-NEXT:    store i32 0, ptr [[TMP18]], align 4
-// CHECK3-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.region_id, ptr [[KERNEL_ARGS]])
 // CHECK3-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
 // CHECK3-NEXT:    br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK3:       omp_offload.failed:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182(i32 [[TMP0]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188(i32 [[TMP0]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188
 // CHECK3-SAME: (i32 noundef [[VLA:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[VLA_ADDR:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined, i32 [[TMP0]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined, i32 [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[VLA:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -2830,7 +3036,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP7:%.*]] = mul nuw i32 2, [[TMP1]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 2
 // CHECK3-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.13, i32 40, i1 false)
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes.15, i32 40, i1 false)
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
@@ -2877,7 +3083,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
 // CHECK3-NEXT:    store ptr [[TMP28]], ptr [[TMP33]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP34]], align 4
+// CHECK3-NEXT:    store ptr @.offload_maptypes.16, ptr [[TMP34]], align 4
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK3-NEXT:    store ptr null, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -2892,16 +3098,16 @@ int bar(int n){
 // CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK3-NEXT:    store i32 0, ptr [[TMP41]], align 4
-// CHECK3-NEXT:    [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.region_id, ptr [[KERNEL_ARGS]])
 // CHECK3-NEXT:    [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
 // CHECK3-NEXT:    br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK3:       omp_offload.failed:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227(ptr [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233(ptr [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    br label [[OMP_IF_END:%.*]]
 // CHECK3:       omp_if.else:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227(ptr [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233(ptr [[THIS1]], i32 [[TMP5]], i32 2, i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_IF_END]]
 // CHECK3:       omp_if.end:
 // CHECK3-NEXT:    [[TMP44:%.*]] = mul nsw i32 1, [[TMP1]]
@@ -2983,9 +3189,9 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[TMP20]], ptr [[TMP24]], align 4
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr @.offload_sizes.15, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.17, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.16, ptr [[TMP26]], align 4
+// CHECK3-NEXT:    store ptr @.offload_maptypes.18, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK3-NEXT:    store ptr null, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -3000,16 +3206,16 @@ int bar(int n){
 // CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP32]], align 4
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK3-NEXT:    store i32 0, ptr [[TMP33]], align 4
-// CHECK3-NEXT:    [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.region_id, ptr [[KERNEL_ARGS]])
 // CHECK3-NEXT:    [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
 // CHECK3-NEXT:    br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK3:       omp_offload.failed:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209(i32 [[TMP1]], i32 [[TMP3]], i32 [[TMP5]], ptr [[B]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215(i32 [[TMP1]], i32 [[TMP3]], i32 [[TMP5]], ptr [[B]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    br label [[OMP_IF_END:%.*]]
 // CHECK3:       omp_if.else:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209(i32 [[TMP1]], i32 [[TMP3]], i32 [[TMP5]], ptr [[B]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215(i32 [[TMP1]], i32 [[TMP3]], i32 [[TMP5]], ptr [[B]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_IF_END]]
 // CHECK3:       omp_if.end:
 // CHECK3-NEXT:    [[TMP36:%.*]] = load i32, ptr [[A]], align 4
@@ -3071,9 +3277,9 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
-// CHECK3-NEXT:    store ptr @.offload_sizes.17, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.19, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
-// CHECK3-NEXT:    store ptr @.offload_maptypes.18, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store ptr @.offload_maptypes.20, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
 // CHECK3-NEXT:    store ptr null, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
@@ -3088,23 +3294,23 @@ int bar(int n){
 // CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
 // CHECK3-NEXT:    store i32 0, ptr [[TMP28]], align 4
-// CHECK3-NEXT:    [[TMP29:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP29:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.region_id, ptr [[KERNEL_ARGS]])
 // CHECK3-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
 // CHECK3-NEXT:    br i1 [[TMP30]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
 // CHECK3:       omp_offload.failed:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192(i32 [[TMP1]], i32 [[TMP3]], ptr [[B]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198(i32 [[TMP1]], i32 [[TMP3]], ptr [[B]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    br label [[OMP_IF_END:%.*]]
 // CHECK3:       omp_if.else:
-// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192(i32 [[TMP1]], i32 [[TMP3]], ptr [[B]]) #[[ATTR3]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198(i32 [[TMP1]], i32 [[TMP3]], ptr [[B]]) #[[ATTR3]]
 // CHECK3-NEXT:    br label [[OMP_IF_END]]
 // CHECK3:       omp_if.end:
 // CHECK3-NEXT:    [[TMP31:%.*]] = load i32, ptr [[A]], align 4
 // CHECK3-NEXT:    ret i32 [[TMP31]]
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233
 // CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
@@ -3125,11 +3331,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK3-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[B_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined, ptr [[TMP0]], i32 [[TMP5]], i32 [[TMP1]], i32 [[TMP2]], ptr [[TMP3]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined, ptr [[TMP0]], i32 [[TMP5]], i32 [[TMP1]], i32 [[TMP2]], ptr [[TMP3]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -3167,7 +3373,7 @@ int bar(int n){
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215
 // CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], i32 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -3191,11 +3397,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8, ptr [[AAA_ADDR]], align 1
 // CHECK3-NEXT:    store i8 [[TMP5]], ptr [[AAA_CASTED]], align 1
 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AAA_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], i32 [[TMP6]], ptr [[TMP0]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], i32 [[TMP6]], ptr [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], i32 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -3231,7 +3437,7 @@ int bar(int n){
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198
 // CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -3249,11 +3455,11 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP3:%.*]] = load i16, ptr [[AA_ADDR]], align 2
 // CHECK3-NEXT:    store i16 [[TMP3]], ptr [[AA_CASTED]], align 2
 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[AA_CASTED]], align 4
-// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], ptr [[TMP0]])
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], ptr [[TMP0]])
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined
 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR2]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -3393,7 +3599,48 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124
+// CHECK9-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK9-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, ptr [[AA_CASTED]], align 8
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined, i64 [[TMP1]], i64 [[TMP3]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// CHECK9-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK9-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK9-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK9-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK9-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148
 // CHECK9-SAME: (i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3426,11 +3673,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK9-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK9-NEXT:    [[TMP9:%.*]] = load i64, ptr [[A_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined, i64 [[TMP9]], ptr [[TMP0]], i64 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP4]], i64 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined, i64 [[TMP9]], ptr [[TMP0]], i64 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i64 [[TMP4]], i64 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 8 dereferenceable(400) [[C:%.*]], i64 noundef [[VLA1:%.*]], i64 noundef [[VLA3:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3502,7 +3749,7 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160
 // CHECK9-SAME: (i64 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[NN_ADDR:%.*]] = alloca i64, align 8
@@ -3511,11 +3758,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK9-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined, i64 [[TMP1]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined, i64 [[TMP1]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3528,11 +3775,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK9-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined, i64 [[TMP1]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined, i64 [[TMP1]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3544,7 +3791,7 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163
 // CHECK9-SAME: (i64 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[NN_ADDR:%.*]] = alloca i64, align 8
@@ -3553,11 +3800,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK9-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = load i64, ptr [[NN_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined, i64 [[TMP1]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined, i64 [[TMP1]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3566,11 +3813,11 @@ int bar(int n){
 // CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
 // CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
 // CHECK9-NEXT:    store i64 [[NN]], ptr [[NN_ADDR]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[NN:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3583,17 +3830,17 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188
 // CHECK9-SAME: (i64 noundef [[VLA:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
 // CHECK9-NEXT:    store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined, i64 [[TMP0]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined, i64 [[TMP0]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[VLA:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3607,7 +3854,7 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215
 // CHECK9-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], i64 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3631,11 +3878,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP5:%.*]] = load i8, ptr [[AAA_ADDR]], align 1
 // CHECK9-NEXT:    store i8 [[TMP5]], ptr [[AAA_CASTED]], align 1
 // CHECK9-NEXT:    [[TMP6:%.*]] = load i64, ptr [[AAA_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], ptr [[TMP0]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], ptr [[TMP0]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], i64 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3671,7 +3918,7 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233
 // CHECK9-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
@@ -3692,11 +3939,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK9-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK9-NEXT:    [[TMP5:%.*]] = load i64, ptr [[B_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined, ptr [[TMP0]], i64 [[TMP5]], i64 [[TMP1]], i64 [[TMP2]], ptr [[TMP3]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined, ptr [[TMP0]], i64 [[TMP5]], i64 [[TMP1]], i64 [[TMP2]], ptr [[TMP3]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3734,7 +3981,7 @@ int bar(int n){
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198
 // CHECK9-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3752,11 +3999,11 @@ int bar(int n){
 // CHECK9-NEXT:    [[TMP3:%.*]] = load i16, ptr [[AA_ADDR]], align 2
 // CHECK9-NEXT:    store i16 [[TMP3]], ptr [[AA_CASTED]], align 2
 // CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[AA_CASTED]], align 8
-// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], ptr [[TMP0]])
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined, i64 [[TMP2]], i64 [[TMP4]], ptr [[TMP0]])
 // CHECK9-NEXT:    ret void
 //
 //
-// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined
 // CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -3889,7 +4136,48 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124
+// CHECK11-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK11-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK11-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined, i32 [[TMP1]], i32 [[TMP3]])
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l124.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], 1
+// CHECK11-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK11-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK11-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK11-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK11-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148
 // CHECK11-SAME: (i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -3922,11 +4210,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
 // CHECK11-NEXT:    store i32 [[TMP8]], ptr [[A_CASTED]], align 4
 // CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[A_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined, i32 [[TMP9]], ptr [[TMP0]], i32 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i32 [[TMP4]], i32 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 9, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined, i32 [[TMP9]], ptr [[TMP0]], i32 [[TMP1]], ptr [[TMP2]], ptr [[TMP3]], i32 [[TMP4]], i32 [[TMP5]], ptr [[TMP6]], ptr [[TMP7]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l142.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l148.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[BN:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[VLA1:%.*]], i32 noundef [[VLA3:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[CN:%.*]], ptr noundef nonnull align 4 dereferenceable(12) [[D:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -3998,7 +4286,7 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160
 // CHECK11-SAME: (i32 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[NN_ADDR:%.*]] = alloca i32, align 4
@@ -4007,11 +4295,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK11-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined, i32 [[TMP1]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined, i32 [[TMP1]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4024,11 +4312,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK11-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined, i32 [[TMP1]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined, i32 [[TMP1]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l154.omp_outlined.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l160.omp_outlined.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4040,7 +4328,7 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163
 // CHECK11-SAME: (i32 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[NN_ADDR:%.*]] = alloca i32, align 4
@@ -4049,11 +4337,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NN_ADDR]], align 4
 // CHECK11-NEXT:    store i32 [[TMP0]], ptr [[NN_CASTED]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[NN_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined, i32 [[TMP1]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined, i32 [[TMP1]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[NN:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4062,11 +4350,11 @@ int bar(int n){
 // CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
 // CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
 // CHECK11-NEXT:    store i32 [[NN]], ptr [[NN_ADDR]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined, ptr [[NN_ADDR]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l157.omp_outlined.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l163.omp_outlined.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[NN:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4079,17 +4367,17 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188
 // CHECK11-SAME: (i32 noundef [[VLA:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[VLA_ADDR:%.*]] = alloca i32, align 4
 // CHECK11-NEXT:    store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined, i32 [[TMP0]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined, i32 [[TMP0]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l182.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6bazzzziPi_l188.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[VLA:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4103,7 +4391,7 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215
 // CHECK11-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], i32 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -4127,11 +4415,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP5:%.*]] = load i8, ptr [[AAA_ADDR]], align 1
 // CHECK11-NEXT:    store i8 [[TMP5]], ptr [[AAA_CASTED]], align 1
 // CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AAA_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], i32 [[TMP6]], ptr [[TMP0]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], i32 [[TMP6]], ptr [[TMP0]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l209.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l215.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], i32 noundef [[AAA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4167,7 +4455,7 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233
 // CHECK11-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
@@ -4188,11 +4476,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4
 // CHECK11-NEXT:    store i32 [[TMP4]], ptr [[B_CASTED]], align 4
 // CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[B_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined, ptr [[TMP0]], i32 [[TMP5]], i32 [[TMP1]], i32 [[TMP2]], ptr [[TMP3]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined, ptr [[TMP0]], i32 [[TMP5]], i32 [[TMP1]], i32 [[TMP2]], ptr [[TMP3]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l227.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l233.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
@@ -4230,7 +4518,7 @@ int bar(int n){
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198
 // CHECK11-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
@@ -4248,11 +4536,11 @@ int bar(int n){
 // CHECK11-NEXT:    [[TMP3:%.*]] = load i16, ptr [[AA_ADDR]], align 2
 // CHECK11-NEXT:    store i16 [[TMP3]], ptr [[AA_CASTED]], align 2
 // CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[AA_CASTED]], align 4
-// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], ptr [[TMP0]])
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined, i32 [[TMP2]], i32 [[TMP4]], ptr [[TMP0]])
 // CHECK11-NEXT:    ret void
 //
 //
-// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l192.omp_outlined
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l198.omp_outlined
 // CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
diff --git a/clang/test/OpenMP/target_visibility.cpp b/clang/test/OpenMP/target_visibility.cpp
index 938d164df89bff..2554f653170b94 100644
--- a/clang/test/OpenMP/target_visibility.cpp
+++ b/clang/test/OpenMP/target_visibility.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
-// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
 // expected-no-diagnostics
 
 
@@ -21,6 +21,14 @@ void B::bar() { A a; a.foo(); }
 void B::sbar() { A::sfoo(); }
 #pragma omp declare target to(B::bar, B::sbar)
 
+[[gnu::visibility("hidden")]] extern const int x = 0;
+#pragma omp declare target to(x) device_type(nohost)
+
+[[gnu::visibility("hidden")]] int y = 0;
+#pragma omp declare target to(y)
+
+// CHECK-DAG: @x = hidden{{.*}} constant i32 0
+// CHECK-DAG: @y = protected{{.*}} i32 0
 // CHECK-DAG: define hidden void @_ZN1B4sbarEv()
 // CHECK-DAG: define linkonce_odr hidden void @_ZN1A4sfooEv()
 // CHECK-DAG: define hidden void @_ZN1B3barEv(
diff --git a/clang/test/Parser/c2x-auto.c b/clang/test/Parser/c2x-auto.c
new file mode 100644
index 00000000000000..b878a5b7c42d47
--- /dev/null
+++ b/clang/test/Parser/c2x-auto.c
@@ -0,0 +1,132 @@
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,c23 -std=c23 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,c17 -std=c17 %s
+
+#define AUTO_MACRO(_NAME, ARG, ARG2, ARG3) \
+auto _NAME = ARG + (ARG2 / ARG3);
+
+struct S {
+  int a;
+  auto b;       // c23-error {{'auto' not allowed in struct member}} \
+                   c17-error {{type name does not allow storage class to be specified}} \
+                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+  union {
+    char c;
+    auto smth;  // c23-error {{'auto' not allowed in union member}} \
+                   c17-error {{type name does not allow storage class to be specified}} \
+                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+  } u;
+};
+
+enum E : auto { // c23-error {{'auto' not allowed here}} \
+                   c17-error {{expected a type}} \
+                   c17-error {{type name does not allow storage class to be specified}}
+  One,
+  Two,
+  Tree,
+};
+
+auto basic_usage(auto auto) {   // c23-error {{'auto' not allowed in function prototype}} \
+                                   c23-error {{'auto' not allowed in function return type}} \
+                                   c23-error {{cannot combine with previous 'auto' declaration specifier}} \
+                                   c17-error {{invalid storage class specifier in function declarator}} \
+                                   c17-error {{illegal storage class on function}} \
+                                   c17-warning {{duplicate 'auto' declaration specifier}} \
+                                   c17-warning {{omitting the parameter name in a function definition is a C23 extension}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  auto = 4;                     // expected-error {{expected identifier or '('}}
+
+  auto a = 4;                   // c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  auto auto aa = 12;            // c23-error {{cannot combine with previous 'auto' declaration specifier}} \
+                                   c17-warning {{duplicate 'auto' declaration specifier}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  auto b[4];                    // c23-error {{'auto' not allowed in array declaration}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  auto array[auto];             // expected-error {{expected expression}} \
+                                   c23-error {{declaration of variable 'array' with deduced type 'auto' requires an initializer}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  AUTO_MACRO(auto, 1, 2, 3);    // c23-error {{cannot combine with previous 'auto' declaration specifier}} \
+                                   expected-error {{expected identifier or '('}} \
+                                   c17-warning {{duplicate 'auto' declaration specifier}}
+
+  auto c = (auto)a;             // expected-error {{expected expression}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  auto ci = (auto){12};         // expected-error {{expected expression}} \
+                                   c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  int auto_cxx_decl = auto(0);  // expected-error {{expected expression}}
+
+  return c;
+}
+
+void structs(void) {
+  struct s_auto { auto a; };            // c23-error {{'auto' not allowed in struct member}} \
+                                           c17-error {{type name does not allow storage class to be specified}} \
+                                           c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  // FIXME: this should end up being rejected when we implement underspecified
+  // declarations in N3006.
+  auto s_int = (struct { int a; } *)0;  // c17-error {{incompatible pointer to integer conversion initializing 'int' with an expression of type}} \
+                                           c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  typedef auto auto_type;               // c23-error {{'auto' not allowed in typedef}} \
+                                           c17-error {{cannot combine with previous 'typedef' declaration specifier}} \
+                                           c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+}
+
+void sizeof_alignas(void) {
+  auto auto_size = sizeof(auto);  // expected-error {{expected expression}} \
+                                     c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+}
+
+void generic_alignof_alignas(void) {
+  int g;
+  _Generic(g, auto : 0);  // c23-error {{'auto' not allowed here}} \
+                             c17-error {{expected a type}} \
+                             c17-error {{type name does not allow storage class to be specified}}
+
+  _Alignof(auto);         // expected-error {{expected expression}} \
+                             expected-warning {{'_Alignof' applied to an expression is a GNU extension}}
+
+  _Alignas(auto);         // expected-error {{expected expression}} \
+                             expected-warning {{declaration does not declare anything}}
+}
+
+void function_designators(void) {
+  extern auto auto_ret_func(void);    // c23-error {{'auto' not allowed in function return type}} \
+                                         c17-error {{cannot combine with previous 'extern' declaration specifier}} \
+                                         c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  extern void auto_param_func(auto);  // c23-error {{'auto' not allowed in function prototype}} \
+                                         c17-error {{invalid storage class specifier in function declarator}} \
+                                         c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  auto (auto_ret_func)(void);         // c23-error {{'auto' not allowed in function return type}} \
+                                         c17-error {{illegal storage class on function}} \
+                                         c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  void (auto_param_func)(auto);       // c23-error {{'auto' not allowed in function prototype}} \
+                                         c17-error {{invalid storage class specifier in function declarator}} \
+                                         c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+}
+
+void atomic(void) {
+  _Atomic(auto) atom1 = 12; // c23-error {{'auto' not allowed here}} \
+                               c23-error {{a type specifier is required for all declarations}} \
+                               c17-error {{expected a type}} \
+                               c17-error {{type name does not allow storage class to be specified}} \
+                               c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+
+  _Atomic auto atom2 = 12;  // c23-error {{_Atomic cannot be applied to type 'auto' in C23}} \
+                               c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+}
+
+void attributes(void) {
+  auto ident [[clang::annotate("this works")]] = 12;  // c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}}
+}
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index d77b699674af4e..c4a9672f0814aa 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -290,3 +290,20 @@
 // RUN:   -fcuda-is-device -fgpu-default-stream=per-thread \
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
 // CHECK-PTH: #define HIP_API_PER_THREAD_DEFAULT_STREAM 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x hip --hipstdpar -triple x86_64-unknown-linux-gnu \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-HIPSTDPAR
+// CHECK-HIPSTDPAR: #define __HIPSTDPAR__ 1
+// CHECK-HIPSTDPAR-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x hip --hipstdpar --hipstdpar-interpose-alloc \
+// RUN:  -triple x86_64-unknown-linux-gnu | FileCheck -match-full-lines %s \
+// RUN:  --check-prefix=CHECK-HIPSTDPAR-INTERPOSE
+// CHECK-HIPSTDPAR-INTERPOSE: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
+// CHECK-HIPSTDPAR-INTERPOSE: #define __HIPSTDPAR__ 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x hip --hipstdpar --hipstdpar-interpose-alloc \
+// RUN:  -triple amdgcn-amd-amdhsa -fcuda-is-device | FileCheck -match-full-lines \
+// RUN:  %s --check-prefix=CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG
+// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG: #define __HIPSTDPAR__ 1
+// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
\ No newline at end of file
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 4dd83cfa0620b9..242197e3f129a3 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -1001,13 +1001,13 @@
 // RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
 // CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}
 
-// RUN: %clang --target=riscv32-unknown-linux-gnu -menable-experimental-extensions \
-// RUN: -march=rv32izfa0p2 -x c -E -dM %s \
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN: -march=rv32izfa -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-ZFA-EXT %s
-// RUN: %clang --target=riscv64-unknown-linux-gnu -menable-experimental-extensions \
-// RUN: -march=rv64izfa0p2 -x c -E -dM %s \
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN: -march=rv64izfa -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-ZFA-EXT %s
-// CHECK-ZFA-EXT: __riscv_zfa 2000{{$}}
+// CHECK-ZFA-EXT: __riscv_zfa 1000000{{$}}
 
 // RUN: %clang --target=riscv32 -menable-experimental-extensions \
 // RUN: -march=rv32izfbfmin0p8 -x c -E -dM %s \
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
new file mode 100644
index 00000000000000..28c76ce74118b8
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fstrict-flex-arrays=3 -fsyntax-only -verify %s
+
+#define __counted_by(f)  __attribute__((counted_by(f)))
+
+struct bar;
+
+struct not_found {
+  int count;
+  struct bar *fam[] __counted_by(bork); // expected-error {{field 'bork' in 'counted_by' not found}}
+};
+
+struct not_found_suggest {
+  int bork; // expected-note {{'bork' declared here}}
+  struct bar *fam[] __counted_by(blork); // expected-error {{field 'blork' in 'counted_by' not found; did you mean 'bork'?}}
+};
+
+int global; // expected-note {{variable 'global' is declared here}}
+
+struct found_outside_of_struct {
+  int bork;
+  struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' is not found in struct}}
+};
+
+struct self_referrential {
+  int bork;
+  struct bar *self[] __counted_by(self); // expected-error {{field 'self' in 'counted_by' cannot refer to the flexible array}}
+};
+
+struct non_int {
+  double non_integer; // expected-error {{field 'non_integer' in 'counted_by' is not a non-boolean integer type}}
+  struct bar *fam[] __counted_by(non_integer); // expected-note {{field 'non_integer' declared here}}
+};
+
+struct array_of_ints {
+  int non_integer[2]; // expected-error {{field 'non_integer' in 'counted_by' is not a non-boolean integer type}}
+  struct bar *fam[] __counted_by(non_integer); // expected-note {{field 'non_integer' declared here}}
+};
+
+struct not_a_fam {
+  double non_integer;
+  struct bar *non_fam __counted_by(non_integer); // expected-error {{'counted_by' only applies to flexible array members}}
+};
diff --git a/clang/test/Sema/attr-target-clones-aarch64.c b/clang/test/Sema/attr-target-clones-aarch64.c
index dd8e72c421b135..9adabf87732130 100644
--- a/clang/test/Sema/attr-target-clones-aarch64.c
+++ b/clang/test/Sema/attr-target-clones-aarch64.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple aarch64-linux-gnu  -fsyntax-only -verify %s
 
-void __attribute__((target_clones("fp16+sve2-aes", "sb+sve2-sha3"))) no_def(void);
+void __attribute__((target_clones("fp16+sve2-aes", "sb+sve2-sha3+rcpc3"))) no_def(void);
 
 // expected-warning@+1 {{unsupported 'default' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
 void __attribute__((target_clones("default+sha3"))) warn1(void);
diff --git a/clang/test/Sema/builtins-x86.c b/clang/test/Sema/builtins-x86.c
index 0882c80514ae9c..cbaf7bcde871e1 100644
--- a/clang/test/Sema/builtins-x86.c
+++ b/clang/test/Sema/builtins-x86.c
@@ -22,19 +22,19 @@ void call_x86_32_builtins(void) {
 }
 
 __m128 test__builtin_ia32_cmpps(__m128 __a, __m128 __b) {
-  return __builtin_ia32_cmpps(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 7]}}
+  return __builtin_ia32_cmpps(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}}
 }
 
 __m128d test__builtin_ia32_cmppd(__m128d __a, __m128d __b) {
-  return __builtin_ia32_cmppd(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 7]}}
+  return __builtin_ia32_cmppd(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}}
 }
 
 __m128 test__builtin_ia32_cmpss(__m128 __a, __m128 __b) {
-  return __builtin_ia32_cmpss(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 7]}}
+  return __builtin_ia32_cmpss(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}}
 }
 
 __m128d test__builtin_ia32_cmpsd(__m128d __a, __m128d __b) {
-  return __builtin_ia32_cmpsd(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 7]}}
+  return __builtin_ia32_cmpsd(__a, __b, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}}
 }
 
 __mmask16 test__builtin_ia32_cmpps512_mask(__m512 __a, __m512 __b) {
diff --git a/clang/test/Sema/c2x-auto.c b/clang/test/Sema/c2x-auto.c
new file mode 100644
index 00000000000000..916c179adcf318
--- /dev/null
+++ b/clang/test/Sema/c2x-auto.c
@@ -0,0 +1,137 @@
+// RUN: %clang_cc1 -std=c2x -verify -pedantic -Wno-comments %s
+
+void test_basic_types(void) {
+  auto undefined;     // expected-error {{declaration of variable 'undefined' with deduced type 'auto' requires an initializer}}
+  auto auto_int = 4;
+  auto auto_long = 4UL;
+  signed auto a = 1L; // expected-error {{'auto' cannot be signed or unsigned}}
+
+  _Static_assert(_Generic(auto_int, int : 1));
+  _Static_assert(_Generic(auto_long, unsigned long : 1));
+}
+
+void test_complex_types(void) {
+  _Complex auto i = 12.0; // expected-error {{'_Complex auto' is invalid}}
+}
+
+void test_gnu_extensions(void) {
+  auto t = ({ // expected-warning {{use of GNU statement expression extension}}
+    auto b = 12;
+    b;
+  });
+  _Static_assert(_Generic(t, int : 1));
+}
+
+void test_sizeof_typeof(void) {
+  auto auto_size = sizeof(auto);  // expected-error {{expected expression}}
+  typeof(auto) tpof = 4;          // expected-error {{expected expression}}
+}
+
+void test_casts(void) {
+  auto int_cast = (int)(4 + 3);
+  auto double_cast = (double)(1 / 3);
+  auto long_cast = (long)(4UL + 3UL);
+  auto auto_cast = (auto)(4 + 3); // expected-error {{expected expression}}
+
+  _Static_assert(_Generic(int_cast, int : 1));
+  _Static_assert(_Generic(double_cast, double : 1));
+  _Static_assert(_Generic(long_cast, long : 1));
+}
+
+void test_compound_literral(void) {
+  auto int_cl = (int){13};
+  auto double_cl = (double){2.5};
+  auto array[] = { 1, 2, 3 }; // expected-error {{cannot use 'auto' with array in C}}
+
+  auto auto_cl = (auto){13};  // expected-error {{expected expression}}
+
+  _Static_assert(_Generic(int_cl, int : 1));
+  _Static_assert(_Generic(double_cl, double : 1));
+}
+
+void test_array_pointers(void) {
+  double array[3] = { 0 };
+  auto a = array;
+  auto b = &array;
+
+  _Static_assert(_Generic(array, double * : 1));
+  _Static_assert(_Generic(a, double * : 1));
+  _Static_assert(_Generic(b, double (*)[3] : 1));
+}
+
+void test_typeof() {
+  int typeof_target();
+  auto result = (typeof(typeof_target())){12};
+
+  _Static_assert(_Generic(result, int : 1));
+}
+
+void test_qualifiers(const int y) {
+  const auto a = 12;
+  auto b = y;
+  static auto c = 1UL;
+  int* pa = &a; // expected-warning {{initializing 'int *' with an expression of type 'const int *' discards qualifiers}}
+  const int* pb = &b;
+  int* pc = &c; // expected-warning {{incompatible pointer types initializing 'int *' with an expression of type 'unsigned long *'}}
+
+  _Static_assert(_Generic(a, int : 1));
+  _Static_assert(_Generic(b, int : 1));
+  _Static_assert(_Generic(c, unsigned long : 1));
+  _Static_assert(_Generic(pa, int * : 1));
+  _Static_assert(_Generic(pb, const int * : 1));
+  _Static_assert(_Generic(pc, int * : 1));
+}
+
+void test_strings(void) {
+  auto str = "this is a string";
+  auto str2[] = "this is a string";       // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  auto (str3) = "this is a string";
+  auto (((str4))) = "this is a string";
+
+  _Static_assert(_Generic(str, char * : 1));
+  _Static_assert(_Generic(str2, char * : 1));
+  _Static_assert(_Generic(str3, char * : 1));
+  _Static_assert(_Generic(str4, char * : 1));
+}
+
+void test_pointers(void) {
+  auto a = 12;
+  auto *ptr = &a;                         // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  auto *str = "this is a string";         // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  const auto *str2 = "this is a string";  // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  auto *b = &a;                           // expected-warning {{type inference of a declaration other than a plain identifier with optional trailing attributes is a Clang extension}}
+  *b = &a;                                // expected-error {{incompatible pointer to integer conversion assigning to 'int' from 'int *'; remove &}}
+  auto nptr = nullptr;
+
+  _Static_assert(_Generic(a, int : 1));
+  _Static_assert(_Generic(ptr, int * : 1));
+  _Static_assert(_Generic(str, char * : 1));
+  _Static_assert(_Generic(str2, const char * : 1));
+  _Static_assert(_Generic(b, int * : 1));
+  _Static_assert(_Generic(nptr, typeof(nullptr) : 1));
+}
+
+void test_prototypes(void) {
+  extern void foo(int a, int array[({ auto x = 12; x;})]);  // expected-warning {{use of GNU statement expression extension}}
+}
+
+void test_scopes(void) {
+  double a = 7;
+  double b = 9;
+  {
+    auto a = a * a; // expected-error {{variable 'a' declared with deduced type 'auto' cannot appear in its own initializer}} \
+                       expected-error {{variable 'a' declared with deduced type 'auto' cannot appear in its own initializer}}
+  }
+  {
+    auto b = a * a;
+    auto a = b;
+
+    _Static_assert(_Generic(b, double : 1));
+    _Static_assert(_Generic(a, double : 1));
+  }
+}
+
+[[clang::overloadable]] auto test(auto x) { // expected-error {{'auto' not allowed in function prototype}} \
+                                               expected-error {{'auto' not allowed in function return type}}
+  return x;
+}
diff --git a/clang/test/Sema/fixed-enum.c b/clang/test/Sema/fixed-enum.c
index c77f5b0cbe79c5..954ff8c452b80c 100644
--- a/clang/test/Sema/fixed-enum.c
+++ b/clang/test/Sema/fixed-enum.c
@@ -4,13 +4,18 @@
 // RUN: %clang_cc1 -Weverything -std=c11 -xc -DC11 -verify %s
 // RUN: %clang_cc1 -pedantic    -std=c11 -xc -DC11 -verify %s
 // RUN: %clang_cc1 -Weverything -std=c11 -xc -fms-extensions -DMS -verify %s
+// RUN: %clang_cc1 -Weverything -std=c2x -xc -DC23 -verify %s
+// RUN: %clang_cc1 -pedantic    -std=c2x -xc -DC23 -verify %s
+// RUN: %clang_cc1 -Weverything -std=c23 -xc -DC23 -verify %s
+// RUN: %clang_cc1 -pedantic    -std=c23 -xc -DC23 -verify %s
+// RUN: %clang_cc1 -Weverything -std=c23 -xc -fms-extensions -DC23 -verify %s
 
 enum X : int {e};
 #if defined(CXX11)
 // expected-warning@-2{{enumeration types with a fixed underlying type are incompatible with C++98}}
 #elif defined(CXX03)
 // expected-warning@-4{{enumeration types with a fixed underlying type are a C++11 extension}}
-#elif defined(OBJC)
+#elif defined(OBJC) || defined(C23)
 // No diagnostic
 #elif defined(C11)
 // expected-warning@-8{{enumeration types with a fixed underlying type are a Clang extension}}
@@ -21,19 +26,19 @@ enum X : int {e};
 // Don't warn about the forward declaration in any language mode.
 enum Fwd : int;
 enum Fwd : int { e2 };
-#ifndef OBJC
+#if !defined(OBJC) && !defined(C23)
 // expected-warning@-3 {{enumeration types with a fixed underlying type}}
 // expected-warning@-3 {{enumeration types with a fixed underlying type}}
 #endif
 
 // Always error on the incompatible redeclaration.
 enum BadFwd : int;
-#ifndef OBJC
+#if !defined(OBJC) && !defined(C23)
 // expected-warning@-2 {{enumeration types with a fixed underlying type}}
 #endif
 // expected-note@-4 {{previous declaration is here}}
 enum BadFwd : char { e3 };
-#ifndef OBJC
+#if !defined(OBJC) && !defined(C23)
 // expected-warning@-2 {{enumeration types with a fixed underlying type}}
 #endif
 // expected-error@-4 {{enumeration redeclared with different underlying type 'char' (was 'int')}}
diff --git a/clang/test/SemaCXX/attr-target-version.cpp b/clang/test/SemaCXX/attr-target-version.cpp
index da24503f775f5b..2e262cda367749 100644
--- a/clang/test/SemaCXX/attr-target-version.cpp
+++ b/clang/test/SemaCXX/attr-target-version.cpp
@@ -5,6 +5,7 @@ void __attribute__((target_version("vmull"))) wrong_tv(void);
 
 void __attribute__((target_version("dotprod"))) no_def(void);
 void __attribute__((target_version("rdm+fp"))) no_def(void);
+void __attribute__((target_version("rcpc3"))) no_def(void);
 
 // expected-error@+1 {{no matching function for call to 'no_def'}}
 void foo(void) { no_def(); }
diff --git a/clang/test/SemaCXX/constant-expression-cxx1z.cpp b/clang/test/SemaCXX/constant-expression-cxx1z.cpp
index 9335626a5c90a4..c0766f70cf8815 100644
--- a/clang/test/SemaCXX/constant-expression-cxx1z.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx1z.cpp
@@ -177,3 +177,16 @@ namespace LambdaCallOp {
     p();
   }
 }
+
+// This used to crash due to an assertion failure,
+// see gh#67690
+namespace {
+  struct C {
+    int x;
+  };
+
+  template <const C *p> void f() {
+    const auto &[c] = *p;
+    &c; // expected-warning {{expression result unused}}
+  }
+}
diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp
index a091fadfa3094b..38cc4be32a27c2 100644
--- a/clang/test/SemaCXX/cxx2a-consteval.cpp
+++ b/clang/test/SemaCXX/cxx2a-consteval.cpp
@@ -8,7 +8,7 @@ consteval int f1(int i) {
   return i;
 }
 
-consteval constexpr int f2(int i) { 
+consteval constexpr int f2(int i) {
   //expected-error@-1 {{cannot combine}}
   return i;
 }
@@ -195,7 +195,7 @@ auto ptr = ret1(0);
 struct A {
   consteval int f(int) {
     // expected-note@-1+ {{declared here}}
-    return 0;    
+    return 0;
   }
 };
 
@@ -239,7 +239,7 @@ constexpr int f_c(int i) {
   int t = f(i);
 // expected-error@-1 {{is not a constant expression}}
 // expected-note@-2 {{function parameter}}
-  return f(0);  
+  return f(0);
 }
 
 consteval int f_eval(int i) {
@@ -675,7 +675,7 @@ Bar<derp> a; // expected-note {{in instantiation of member function 'issue_55601
 
 struct constantDerp {
   // Can be used in a constant expression.
-  consteval constantDerp(int) {} 
+  consteval constantDerp(int) {}
   consteval operator int() const { return 5; }
 };
 Bar<constantDerp> b;
@@ -1175,4 +1175,28 @@ struct T {
     static constexpr auto xx = ns::foo(A{}); // expected-error {{cannot take address of consteval function 'foo' outside of an immediate invocation}}
 };
 
+}
+
+namespace GH65520 {
+
+consteval int bar (int i) { if (i != 1) return 1/0; return 0; }
+// expected-note@-1{{division by zero}}
+
+void
+g ()
+{
+  int a_ok[bar(1)];
+  int a_err[bar(3)]; // expected-error {{call to consteval function 'GH65520::bar' is not a constant expression}} \
+                     // expected-note {{in call to 'bar(3)'}}
+}
+
+consteval int undefined(); // expected-note {{declared here}}
+
+consteval void immediate() {
+    int a [undefined()]; // expected-note  {{undefined function 'undefined' cannot be used in a constant expression}} \
+                         // expected-error {{call to consteval function 'GH65520::undefined' is not a constant expression}} \
+                         // expected-error {{variable of non-literal type 'int[undefined()]' cannot be defined in a constexpr function before C++23}}
+}
+
+
 }
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this-compat.cpp b/clang/test/SemaCXX/cxx2b-deducing-this-compat.cpp
new file mode 100644
index 00000000000000..32406dfcac4257
--- /dev/null
+++ b/clang/test/SemaCXX/cxx2b-deducing-this-compat.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 %s -verify
+
+struct S {
+    void f(this auto &a); // expected-error {{explicit object parameters are incompatible with C++ standards before C++2b}}
+};
+
+void f() {
+    (void)[](this auto&a){}; // expected-error {{explicit object parameters are incompatible with C++ standards before C++2b}}
+}
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp b/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp
new file mode 100644
index 00000000000000..9dbea17dd2cae3
--- /dev/null
+++ b/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++2b %s -verify
+// expected-no-diagnostics
+
+template <typename Base>
+struct Wrap : Base {
+
+};
+
+struct S {
+    constexpr int f(this const S&) {
+        return 42;
+    }
+    constexpr int f(this const S&, auto&&... args) {
+        return (args + ... + 0);
+    }
+    constexpr int operator[](this const S&) {
+        return 42;
+    }
+    constexpr int operator[](this const S& self, int i) {
+        return i + self.base;
+    }
+    constexpr int operator()(this const S&) {
+        return 42;
+    }
+    constexpr int operator()(this const S& self, int i) {
+        return self.base + i;
+    }
+    constexpr bool operator==(this const S& self, auto && test) {
+        return self.base == test;
+    };
+    constexpr int operator*(this const S& self) {
+        return self.base + 22;
+    };
+    constexpr operator Wrap<S> (this const S& self) {
+        return Wrap<S>{self};
+    };
+    constexpr int operator <<(this Wrap<S> self, int i) {
+        return self.base+i;
+    }
+
+    int base = 20;
+};
+
+consteval void test() {
+    constexpr S s;
+    static_assert(s.f() == 42);
+    static_assert(s[] == 42);
+    static_assert(s[22] == 42);
+    static_assert(s.f() == 42);
+    static_assert(s() == 42);
+    static_assert(s(22) == 42);
+    static_assert(s == 20);
+    static_assert(s != 0);
+    static_assert(*s == 42);
+    static_assert((s << 11) == 31);
+}
+
+namespace GH68070 {
+
+constexpr auto f = [x = 3]<typename Self>(this Self&& self) {
+    return x;
+};
+
+auto g = [x = 3]<typename Self>(this Self&& self) {
+    return x;
+};
+
+int test() {
+  constexpr int a = f();
+  static_assert(a == 3);
+  return f() + g();
+}
+
+}
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this-coro.cpp b/clang/test/SemaCXX/cxx2b-deducing-this-coro.cpp
new file mode 100644
index 00000000000000..dfa50cb75acfab
--- /dev/null
+++ b/clang/test/SemaCXX/cxx2b-deducing-this-coro.cpp
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -std=c++2b %s -fsyntax-only -verify
+
+#include "Inputs/std-coroutine.h"
+
+struct S;
+template <typename T>
+class coro_test {
+public:
+    struct promise_type;
+    using handle = std::coroutine_handle<promise_type>;
+	struct promise_type {
+        promise_type(const promise_type&) = delete; // #copy-ctr
+        promise_type(T);  // #candidate
+        coro_test get_return_object();
+        std::suspend_never initial_suspend();
+	    std::suspend_never final_suspend() noexcept;
+	    void return_void();
+        void unhandled_exception();
+
+
+        template<typename Arg, typename... Args>
+        void* operator new(decltype(0zu) sz, Arg&&, Args&... args) {
+            static_assert(!__is_same(__decay(Arg), S), "Ok"); // expected-error 2{{Ok}}
+        }
+
+    };
+private:
+	handle h;
+};
+
+
+template <typename Ret, typename... P>
+struct std::coroutine_traits<coro_test<S&>, Ret, P...> {
+  using promise_type = coro_test<S&>::promise_type;
+  static_assert(!__is_same(Ret, S&), "Ok"); // expected-error{{static assertion failed due to requirement '!__is_same(S &, S &)': Ok}}
+};
+
+
+struct S {
+
+    coro_test<S&> ok(this S&, int) {
+        co_return; // expected-note {{in instantiation}}
+    }
+
+    coro_test<const S&> ok2(this const S&) { // expected-note {{in instantiation}}
+        co_return;
+    }
+
+    coro_test<int> ko(this const S&) {  // expected-error {{no matching constructor for initialization of 'std::coroutine_traits<coro_test<int>, const S &>::promise_type'}} \
+                                        // expected-note {{in instantiation}} \
+                                       // FIXME: the message below is unhelpful but this is pre-existing
+                                       // expected-note@#candidate {{candidate constructor not viable: requires 1 argument, but 0 were provided}} \
+                                       // expected-note@#copy-ctr  {{candidate constructor not viable}}
+        co_return;
+    }
+
+};
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
new file mode 100644
index 00000000000000..cb83270752443a
--- /dev/null
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -0,0 +1,544 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++2b -Woverloaded-virtual %s -verify
+
+
+// FIXME: can we improve these diagnostics?
+void f(this); // expected-error{{variable has incomplete type 'void'}} \
+              // expected-error{{invalid use of 'this' outside of a non-static member function}}
+
+void g(this auto); // expected-error{{an explicit object parameter cannot appear in a non-member function}}
+
+auto l1 = [] (this auto) static {}; // expected-error{{an explicit object parameter cannot appear in a static lambda}}
+auto l2 = [] (this auto) mutable {}; // expected-error{{a lambda with an explicit object parameter cannot be mutable}}
+auto l3 = [](this auto...){}; // expected-error {{the explicit object parameter cannot be a function parameter pack}}
+auto l4 = [](int, this auto){}; // expected-error {{an explicit object parameter can only appear as the first parameter of the lambda}}
+
+struct S {
+    static void f(this auto); // expected-error{{an explicit object parameter cannot appear in a static function}}
+    virtual void f(this S); // expected-error{{an explicit object parameter cannot appear in a virtual function}}
+
+    void g(this auto) const; // expected-error{{explicit object member function cannot have 'const' qualifier}}
+    void h(this auto) &; // expected-error{{explicit object member function cannot have '&' qualifier}}
+    void i(this auto) &&; // expected-error{{explicit object member function cannot have '&&' qualifier}}
+    void j(this auto) volatile; // expected-error{{explicit object member function cannot have 'volatile' qualifier}}
+    void k(this auto) __restrict; // expected-error{{explicit object member function cannot have '__restrict' qualifier}}
+    void l(this auto) _Nonnull; // expected-error{{explicit object member function cannot have '' qualifie}}
+
+
+    void variadic(this auto...); // expected-error{{the explicit object parameter cannot be a function parameter pack}}
+    void not_first(int, this auto); // expected-error {{an explicit object parameter can only appear as the first parameter of the function}}
+
+    S(this auto); // expected-error {{an explicit object parameter cannot appear in a constructor}}
+    ~S(this S) {} // expected-error {{an explicit object parameter cannot appear in a destructor}} \
+                  // expected-error {{destructor cannot have any parameters}}
+};
+
+namespace Override {
+struct A {
+    virtual void f(); // expected-note 2{{here}}
+    virtual void g(int); // expected-note {{here}}
+    virtual void h() const; // expected-note 5{{here}}
+};
+
+// CWG2553
+struct B : A {
+    int f(this B&, int); // expected-warning {{hides overloaded virtual function}}
+    int f(this B&);  // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+    int g(this B&); // expected-warning {{hides overloaded virtual function}}
+    int h(this B&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+    int h(this B&&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+    int h(this const B&&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+    int h(this A&); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+    int h(this int); // expected-error {{an explicit object parameter cannot appear in a virtual function}}
+};
+}
+
+namespace DefaultArgs {
+     struct Test { void f(this const auto& = Test{}); };
+    // expected-error@-1 {{the explicit object parameter cannot have a default argument}}
+    auto L = [](this const auto& = Test{}){};
+    // expected-error@-1 {{the explicit object parameter cannot have a default argument}}
+}
+
+struct CannotUseThis {
+    int fun();
+    int m;
+    void f(this auto) {
+        this->fun(); // expected-error{{invalid use of 'this' in a function with an explicit object parameter}}
+        fun(); // expected-error {{call to non-static member function without an object argument}}
+        m = 0; // expected-error {{invalid use of member 'm' in explicit object member function}}
+    }
+};
+
+struct CannotUseThisBase {
+  void foo();
+  int n;
+  static int i;
+};
+
+struct CannotUseThisDerived : CannotUseThisBase {
+  void bar(this auto) {
+    foo(); // expected-error {{call to non-static member function without an object argument}}
+    n = 12; // expected-error {{invalid use of member 'n' in explicit object member function}}
+    i = 100;
+  }
+};
+
+namespace ThisInLambdaWithCaptures {
+
+struct Test {
+    Test(auto&&);
+};
+
+void test() {
+
+    [i = 0](this Test) { }();
+    // expected-error@-1 {{invalid explicit object parameter type 'ThisInLambdaWithCaptures::Test' in lambda with capture; the type must be the same as, or derived from, the lambda}}
+
+    struct Derived;
+    auto ok = [i = 0](this const Derived&) {};
+    auto ko = [i = 0](this const Test&) {};
+    // expected-error@-1 {{invalid explicit object parameter type 'ThisInLambdaWithCaptures::Test' in lambda with capture; the type must be the same as, or derived from, the lambda}}
+
+    struct Derived : decltype(ok){};
+    Derived dok{ok};
+    dok();
+
+    struct DerivedErr : decltype(ko){};
+    DerivedErr dko{ko};
+    dko();
+
+    auto alsoOk = [](this const Test &) {};
+    alsoOk();
+}
+
+struct Frobble;
+auto nothingIsOkay = [i = 0](this const Frobble &) {};  // expected-note {{candidate function not viable: requires 0 non-object arguments, but 1 was provided}}
+struct Frobble {} f;
+void test2()  {
+    nothingIsOkay(f); // expected-error {{no matching function for call to object of type}}
+}
+
+}
+
+struct Corresponding {
+    void a(this Corresponding&); // expected-note 2{{here}}
+    void a(); // expected-error{{cannot be redeclared}}
+    void a() &; // expected-error{{cannot be redeclared}}
+    void a(this Corresponding&, int);
+    void a(this Corresponding&, double);
+
+    void b(this const Corresponding&); // expected-note 2{{here}}
+    void b() const; // expected-error{{cannot be redeclared}}
+    void b() const &; // expected-error{{cannot be redeclared}}
+
+    void c(this Corresponding&&); // expected-note {{here}}
+    void c() &&; // expected-error{{cannot be redeclared}}
+
+    void d(this Corresponding&);
+    void d(this Corresponding&&);
+    void d(this const Corresponding&);
+    void d(this const int&);
+    void d(this const int);
+    void d(this int);
+
+    void e(this const Corresponding&&); // expected-note {{here}}
+    void e() const &&; // expected-error{{cannot be redeclared}}
+
+};
+
+template <typename T>
+struct CorrespondingTpl {
+    void a(this CorrespondingTpl&); // expected-note 2{{here}}
+    void a(); // expected-error{{cannot be redeclared}}
+    void a() &; // expected-error{{cannot be redeclared}}
+    void a(this Corresponding&, int);
+    void a(this Corresponding&, double);
+    void a(long);
+
+
+    void b(this const CorrespondingTpl&); // expected-note 2{{here}}
+    void b() const; // expected-error{{cannot be redeclared}}
+    void b() const &; // expected-error{{cannot be redeclared}}
+
+    void c(this CorrespondingTpl&&); // expected-note {{here}}
+    void c() &&; // expected-error{{cannot be redeclared}}
+
+    void d(this Corresponding&);
+    void d(this Corresponding&&);
+    void d(this const Corresponding&);
+    void d(this const int&);
+    void d(this const int);
+    void d(this int);
+
+    void e(this const CorrespondingTpl&&); // expected-note {{here}}
+    void e() const &&; // expected-error{{cannot be redeclared}}
+};
+
+struct C {
+    template <typename T>
+    C(T){}
+};
+
+void func(int i) {
+    (void)[=](this auto&&) { return i; }();
+    (void)[=](this const auto&) { return i; }();
+    (void)[i](this C) { return i; }(); // expected-error{{invalid explicit object parameter type 'C'}}
+    (void)[=](this C) { return i; }(); // expected-error{{invalid explicit object parameter type 'C'}}
+    (void)[](this C) { return 42; }();
+    auto l = [=](this auto&) {};
+    struct D : decltype(l) {};
+    D d{l};
+    d();
+}
+
+void TestMutationInLambda() {
+    [i = 0](this auto &&){ i++; }();
+    [i = 0](this auto){ i++; }();
+    [i = 0](this const auto&){ i++; }();
+    // expected-error@-1 {{cannot assign to a variable captured by copy in a non-mutable lambda}}
+}
+
+struct Over_Call_Func_Example {
+    void a();
+    void b() {
+        a(); // ok, (*this).a()
+    }
+
+    void f(this const Over_Call_Func_Example&); // expected-note {{here}}
+    void g() const {
+        f();       // ok: (*this).f()
+        f(*this);  // expected-error{{too many non-object arguments to function call}}
+        this->f(); // ok
+    }
+
+    static void h() {
+        f();       // expected-error{{call to non-static member function without an object argument}}
+        f(Over_Call_Func_Example{});   // expected-error{{call to non-static member function without an object argument}}
+        Over_Call_Func_Example{}.f();   // ok
+    }
+
+    void k(this int);
+    operator int() const;
+    void m(this const Over_Call_Func_Example& c) {
+        c.k();     // ok
+    }
+};
+
+struct AmbiguousConversion {
+  void f(this int); // expected-note {{candidate function}}
+  void f(this float); // expected-note {{candidate function}}
+
+  operator int() const;
+  operator float() const;
+
+  void test(this const AmbiguousConversion &s) {
+    s.f(); // expected-error {{call to member function 'f' is ambiguous}}
+  }
+};
+
+struct IntToShort {
+  void s(this short);
+  operator int() const;
+  void test(this const IntToShort &val) {
+    val.s();
+  }
+};
+
+struct ShortToInt {
+  void s(this int);
+  operator short() const;
+  void test(this const ShortToInt &val) {
+    val.s();
+  }
+};
+
+namespace arity_diagnostics {
+struct S {
+    void f(this auto &&, auto, auto); // expected-note {{requires 2 non-object arguments, but 0 were provided}}
+    void g(this auto &&, auto, auto); // expected-note {{requires 2 non-object arguments, but 3 were provided}}
+    void h(this auto &&, int, int i = 0); // expected-note {{requires at least 1 non-object argument, but 0 were provided}}
+    void i(this S&&, int); // expected-note 2{{declared here}}
+};
+
+int test() {
+    void(*f)(S&&, int, int) = &S::f;
+    f(S{}, 1, 2);
+    f(S{}, 1); // expected-error {{too few arguments to function call, expected 3, have 2}}
+    f(S{}); // expected-error {{too few arguments to function call, expected 3, have 1}}
+    f(S{}, 1, 2, 3); //expected-error {{too many arguments to function call, expected 3, have 4}}
+
+    S{}.f(1, 2);
+    S{}.f(); //  expected-error{{no matching member function for call to 'f'}}
+    S{}.g(1,2,3); // expected-error {{no matching member function for call to 'g'}}
+    S{}.h(); // expected-error {{no matching member function for call to 'h'}}
+    S{}.i(); // expected-error {{too few non-object arguments to function call, expected 1, have 0}}
+    S{}.i(1, 2, 3); // expected-error {{too many non-object arguments to function call, expected 1, have 3}}
+}
+
+}
+
+namespace AddressOf {
+
+struct s {
+    static void f(int);
+    void f(this auto &&) {}
+    void g(this s &&) {};
+
+    void test_qual() {
+        using F = void(s&&);
+        F* a = &f; // expected-error {{must explicitly qualify name of member function when taking its address}}
+        F* b = &g; // expected-error {{must explicitly qualify name of member function when taking its address}}
+        F* c = &s::f;
+        F* d = &s::g;
+    }
+};
+
+void test() {
+    using F = void(s&&);
+    F* a = &s::f;
+    F* b = &s::g;
+    a(s{});
+    b(s{});
+}
+
+}
+
+namespace std {
+  struct strong_ordering {
+    int n;
+    constexpr operator int() const { return n; }
+    static const strong_ordering equal, greater, less;
+  };
+  constexpr strong_ordering strong_ordering::equal = {0};
+  constexpr strong_ordering strong_ordering::greater = {1};
+  constexpr strong_ordering strong_ordering::less = {-1};
+}
+
+namespace operators_deduction {
+
+template <typename T, typename U>
+constexpr bool is_same = false;
+
+template <typename T>
+constexpr bool is_same<T, T> = true;
+
+template <template <typename> typename T>
+struct Wrap {
+void f();
+struct S {
+    operator int(this auto&& self) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    Wrap* operator->(this auto&& self) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return new Wrap();
+    }
+    int operator[](this auto&& self, int) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    int operator()(this auto&& self, int) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    int operator++(this auto&& self, int) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    int operator++(this auto&& self) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    int operator--(this auto&& self, int) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    int operator--(this auto&& self) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    int operator*(this auto&& self) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return 0;
+    }
+    bool operator==(this auto&& self, int) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return false;
+    }
+    bool operator<=>(this auto&& self, int) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return false;
+    }
+    bool operator<<(this auto&& self, int b) {
+        static_assert(is_same<decltype(self), typename T<S>::type>);
+        return false;
+    }
+};
+};
+
+template <typename T>
+struct lvalue_reference {
+    using type = T&;
+};
+template <typename T>
+struct const_lvalue_reference {
+    using type = const T&;
+};
+template <typename T>
+struct volatile_lvalue_reference {
+    using type = volatile T&;
+};
+template <typename T>
+struct rvalue_reference {
+    using type = T&&;
+};
+template <typename T>
+struct const_rvalue_reference {
+    using type = const T&&;
+};
+
+
+void test() {
+    {
+        Wrap<lvalue_reference>::S s;
+        s++;
+        s.operator++(0);
+        ++s;
+        s.operator++();
+        s--;
+        s.operator--(0);
+        --s;
+        s.operator--();
+        s[0];
+        s.operator[](0);
+        s(0);
+        s.operator()(0);
+        *s;
+        s.operator*();
+        s->f();
+        s.operator->();
+        int i = s;
+        (void)(s << 0);
+        s.operator<<(0);
+        (void)(s == 0);
+        s.operator==(0);
+        (void)(s <=> 0);
+        s.operator<=>(0);
+    }
+    {
+        const Wrap<const_lvalue_reference>::S s;
+        s++;
+        s.operator++(0);
+        ++s;
+        s.operator++();
+        s--;
+        s.operator--(0);
+        --s;
+        s.operator--();
+        s[0];
+        s.operator[](0);
+        s(0);
+        s.operator()(0);
+        *s;
+        s.operator*();
+        s->f();
+        s.operator->();
+        int i = s;
+        (void)(s << 0);
+        s.operator<<(0);
+        (void)(s == 0);
+        s.operator==(0);
+        (void)(s <=> 0);
+        s.operator<=>(0);
+    }
+    {
+        volatile Wrap<volatile_lvalue_reference>::S s;
+        s++;
+        s.operator++(0);
+        ++s;
+        s.operator++();
+        s--;
+        s.operator--(0);
+        --s;
+        s.operator--();
+        s[0];
+        s.operator[](0);
+        s(0);
+        s.operator()(0);
+        *s;
+        s.operator*();
+        s->f();
+        s.operator->();
+        int i = s;
+        (void)(s << 0);
+        s.operator<<(0);
+        (void)(s == 0);
+        s.operator==(0);
+        (void)(s <=> 0);
+        s.operator<=>(0);
+    }
+    {
+        Wrap<rvalue_reference>::S s;
+        using M = Wrap<rvalue_reference>::S&&;
+        ((M)s)++;
+        ((M)s).operator++(0);
+        ++((M)s);
+        ((M)s).operator++();
+        ((M)s)--;
+        ((M)s).operator--(0);
+        --((M)s);
+        ((M)s).operator--();
+        ((M)s)[0];
+        ((M)s).operator[](0);
+        ((M)s)(0);
+        ((M)s).operator()(0);
+        *((M)s);
+        ((M)s).operator*();
+        ((M)s)->f();
+        ((M)s).operator->();
+        int i = ((M)s);
+        (void)(((M)s) << 0);
+        ((M)s).operator<<(0);
+        (void)(((M)s) == 0);
+        ((M)s).operator==(0);
+        (void)(((M)s) <=> 0);
+        ((M)s).operator<=>(0);
+    }
+}
+}
+
+namespace conversions {
+//[over.best.ics]
+struct Y { Y(int); }; //expected-note 3{{candidate}}
+struct A { operator int(this auto&&); };  //expected-note {{candidate}}
+Y y1 = A();   // expected-error{{no viable conversion from 'A' to 'Y'}}
+
+struct X { X(); }; //expected-note 3{{candidate}}
+struct B { operator X(this auto&&); };
+B b;
+X x{{b}}; // expected-error{{no matching constructor for initialization of 'X'}}
+
+struct T{}; // expected-note 2{{candidate constructor}}
+struct C {
+    operator T (this int); // expected-note {{candidate function not viable: no known conversion from 'C' to 'int' for object argument}}
+    operator int() const; // expected-note {{candidate function}}
+};
+
+void foo(C c) {
+   T d = c; // expected-error {{no viable conversion from 'C' to 'T'}}
+}
+
+}
+
+namespace surrogate {
+using fn_t = void();
+struct C {
+    operator fn_t * (this C const &);
+};
+
+void foo(C c) {
+   c();
+}
+
+}
diff --git a/clang/test/SemaCXX/gh65522.cpp b/clang/test/SemaCXX/gh65522.cpp
new file mode 100644
index 00000000000000..2d6331b0372a31
--- /dev/null
+++ b/clang/test/SemaCXX/gh65522.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -std=c++20 -Wc++17-compat -verify -Wno-unused %s
+
+class X {};
+
+template<typename T>
+class B3 { // expected-note {{candidate template ignored: could not match 'B3<T>' against 'int'}}
+  template<X x> B3(T); // expected-warning 2{{non-type template parameter of type 'X' is incompatible with C++ standards before C++20}} \
+                       // expected-note {{candidate template ignored: couldn't infer template argument 'x'}}
+};
+B3 b3 = 0; // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'B3'}} \
+           // expected-note {{while building implicit deduction guide first needed here}}
diff --git a/clang/test/SemaCXX/pr64462.cpp b/clang/test/SemaCXX/pr64462.cpp
new file mode 100644
index 00000000000000..7cffb3583dcd40
--- /dev/null
+++ b/clang/test/SemaCXX/pr64462.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -verify -std=c++20 -fsyntax-only %s
+
+auto c1(auto f, auto ...fs) {
+  constexpr bool a = true;
+  return [](auto) requires a {
+    constexpr bool b = true;
+    return []() requires a && b {
+      constexpr bool c = true;
+      return [](auto) requires a && b && c {
+        constexpr bool d = true;
+        // expected-note@+2{{because substituted constraint expression is ill-formed: no matching function for call to 'c1'}}
+        // expected-note@+1{{candidate function not viable: constraints not satisfied}}
+        return []() requires a && b && c && d && (c1(fs...)) {};
+      };
+    }();
+  }(1);
+}
+
+void foo() {
+  c1(true)(1.0)(); // expected-error{{no matching function for call to object of type}}
+}
diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp
index c2bb2d2ebc6ffb..b19aac6cabd132 100644
--- a/clang/test/SemaCXX/static-assert-cxx26.cpp
+++ b/clang/test/SemaCXX/static-assert-cxx26.cpp
@@ -298,3 +298,12 @@ Good<Frobble> a; // expected-note {{in instantiation}}
 Bad<int> b; // expected-note {{in instantiation}}
 
 }
+
+namespace EscapeInDiagnostic {
+static_assert('\u{9}' == (char)1, ""); // expected-error {{failed}} \
+                                       // expected-note {{evaluates to ''\t' (0x09, 9) == '<U+0001>' (0x01, 1)'}}
+static_assert((char8_t)-128 == (char8_t)-123, ""); // expected-error {{failed}} \
+                                                   // expected-note {{evaluates to 'u8'<80>' (0x80, 128) == u8'<85>' (0x85, 133)'}}
+static_assert((char16_t)0xFEFF == (char16_t)0xDB93, ""); // expected-error {{failed}} \
+                                                         // expected-note {{evaluates to 'u'﻿' (0xFEFF, 65279) == u'\xDB93' (0xDB93, 56211)'}}
+}
diff --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp
index 5d64ff682a20e5..4200d821339ede 100644
--- a/clang/test/SemaCXX/static-assert.cpp
+++ b/clang/test/SemaCXX/static-assert.cpp
@@ -268,7 +268,31 @@ namespace Diagnostics {
     return 'c';
   }
   static_assert(getChar() == 'a', ""); // expected-error {{failed}} \
-                                       // expected-note {{evaluates to ''c' == 'a''}}
+                                       // expected-note {{evaluates to ''c' (0x63, 99) == 'a' (0x61, 97)'}}
+  static_assert((char)9 == '\x61', ""); // expected-error {{failed}} \
+                                        // expected-note {{evaluates to ''\t' (0x09, 9) == 'a' (0x61, 97)'}}
+  static_assert((char)10 == '\0', ""); // expected-error {{failed}} \
+                                       // expected-note {{n' (0x0A, 10) == '<U+0000>' (0x00, 0)'}}
+  // The note above is intended to match "evaluates to '\n' (0x0A, 10) == '<U+0000>' (0x00, 0)'", but if we write it as it is,
+  // the "\n" cannot be consumed by the diagnostic consumer.
+  static_assert((signed char)10 == (char)-123, ""); // expected-error {{failed}} \
+                                                    // expected-note {{evaluates to '10 == '<85>' (0x85, -123)'}}
+  static_assert((char)-4 == (unsigned char)-8, ""); // expected-error {{failed}} \
+                                                    // expected-note {{evaluates to ''<FC>' (0xFC, -4) == 248'}}
+  static_assert((char)-128 == (char)-123, ""); // expected-error {{failed}} \
+                                               // expected-note {{evaluates to ''<80>' (0x80, -128) == '<85>' (0x85, -123)'}}
+  static_assert('\xA0' == (char)'\x20', ""); // expected-error {{failed}} \
+                                             // expected-note {{evaluates to ''<A0>' (0xA0, -96) == ' ' (0x20, 32)'}}
+  static_assert((char16_t)L'ゆ' == L"C̵̭̯̠̎͌ͅť̺"[1], ""); // expected-error {{failed}} \
+                                                  // expected-note {{evaluates to 'u'ゆ' (0x3086, 12422) == L'̵' (0x335, 821)'}}
+  static_assert(L"＼／"[1] == u'\xFFFD', ""); // expected-error {{failed}} \
+                                              // expected-note {{evaluates to 'L'／' (0xFF0F, 65295) == u'�' (0xFFFD, 65533)'}}
+  static_assert(L"⚾"[0] == U'🌍', ""); // expected-error {{failed}} \
+                                         // expected-note {{evaluates to 'L'⚾' (0x26BE, 9918) == U'🌍' (0x1F30D, 127757)'}}
+  static_assert(U"\a"[0] == (wchar_t)9, ""); // expected-error {{failed}} \
+                                             // expected-note {{evaluates to 'U'\a' (0x07, 7) == L'\t' (0x09, 9)'}}
+  static_assert(L"§"[0] == U'Ö', ""); // expected-error {{failed}} \
+                                      // expected-note {{evaluates to 'L'§' (0xA7, 167) == U'Ö' (0xD6, 214)'}}
 
   /// Bools are printed as bools.
   constexpr bool invert(bool b) {
diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
index 8e312e589d8116..205cfa284f6c9c 100644
--- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
@@ -5580,6 +5580,85 @@ class Bar {
   }
 };
 
+class Return {
+  Mutex mu;
+  Foo foo GUARDED_BY(mu);
+  Foo* foo_ptr PT_GUARDED_BY(mu);
+
+  Foo returns_value_locked() {
+    MutexLock lock(&mu);
+    return foo;
+  }
+
+  Foo returns_value_locks_required() EXCLUSIVE_LOCKS_REQUIRED(mu) {
+    return foo;
+  }
+
+  Foo returns_value_releases_lock_after_return() UNLOCK_FUNCTION(mu) {
+    MutexLock lock(&mu, true);
+    return foo;
+  }
+
+  Foo returns_value_aquires_lock() EXCLUSIVE_LOCK_FUNCTION(mu) {
+    mu.Lock();
+    return foo;
+  }
+  
+  Foo returns_value_not_locked() {
+    return foo;               // expected-warning {{reading variable 'foo' requires holding mutex 'mu'}}
+  }
+  
+  Foo returns_value_releases_lock_before_return() UNLOCK_FUNCTION(mu) {
+    mu.Unlock();
+    return foo;               // expected-warning {{reading variable 'foo' requires holding mutex 'mu'}}
+  }
+
+  Foo &returns_ref_not_locked() {
+    return foo;               // expected-warning {{returning variable 'foo' by reference requires holding mutex 'mu'}}
+  }
+
+  Foo &returns_ref_locked() {
+    MutexLock lock(&mu);
+    return foo;               // expected-warning {{returning variable 'foo' by reference requires holding mutex 'mu'}}
+  }
+
+  Foo &returns_ref_shared_locks_required() SHARED_LOCKS_REQUIRED(mu) {
+    return foo;               // expected-warning {{returning variable 'foo' by reference requires holding mutex 'mu' exclusively}}
+  }
+
+  Foo &returns_ref_exclusive_locks_required() EXCLUSIVE_LOCKS_REQUIRED(mu) {
+    return foo;
+  }
+
+  Foo &returns_ref_releases_lock_after_return() UNLOCK_FUNCTION(mu) {
+    MutexLock lock(&mu, true);
+    return foo;               // expected-warning {{returning variable 'foo' by reference requires holding mutex 'mu' exclusively}}
+  }
+
+  Foo& returns_ref_releases_lock_before_return() UNLOCK_FUNCTION(mu) {
+    mu.Unlock();
+    return foo;               // // expected-warning {{returning variable 'foo' by reference requires holding mutex 'mu' exclusively}}
+  }
+  
+  Foo &returns_ref_aquires_lock() EXCLUSIVE_LOCK_FUNCTION(mu) {
+    mu.Lock();
+    return foo;
+  }
+  
+  const Foo &returns_constref_shared_locks_required() SHARED_LOCKS_REQUIRED(mu) {
+    return foo;
+  }
+  
+  Foo *returns_ptr() {
+    return &foo;              // FIXME -- Do we want to warn on this ?
+  }
+
+  Foo &returns_ref2() {
+    return *foo_ptr;          // expected-warning {{returning the value that 'foo_ptr' points to by reference requires holding mutex 'mu' exclusively}}
+  }
+
+};
+
 
 }  // end namespace PassByRefTest
 
diff --git a/clang/test/SemaHipStdPar/device-can-call-host.cpp b/clang/test/SemaHipStdPar/device-can-call-host.cpp
new file mode 100644
index 00000000000000..3fedc179251d28
--- /dev/null
+++ b/clang/test/SemaHipStdPar/device-can-call-host.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 -x hip %s --hipstdpar -triple amdgcn-amd-amdhsa --std=c++17 \
+// RUN:   -fcuda-is-device -emit-llvm -o /dev/null -verify
+
+// Note: These would happen implicitly, within the implementation of the
+//       accelerator specific algorithm library, and not from user code.
+
+// Calls from the accelerator side to implicitly host (i.e. unannotated)
+// functions are fine.
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+extern "C" void host_fn() {}
+
+struct Dummy {};
+
+struct S {
+  S() {}
+  ~S() { host_fn(); }
+
+  int x;
+};
+
+struct T {
+  __device__ void hd() { host_fn(); }
+
+  __device__ void hd3();
+
+  void h() {}
+
+  void operator+();
+  void operator-(const T&) {}
+
+  operator Dummy() { return Dummy(); }
+};
+
+__device__ void T::hd3() { host_fn(); }
+
+template <typename T> __device__ void hd2() { host_fn(); }
+
+__global__ void kernel() { hd2<int>(); }
+
+__device__ void hd() { host_fn(); }
+
+template <typename T> __device__ void hd3() { host_fn(); }
+__device__ void device_fn() { hd3<int>(); }
+
+__device__ void local_var() {
+  S s;
+}
+
+__device__ void explicit_destructor(S *s) {
+  s->~S();
+}
+
+__device__ void hd_member_fn() {
+  T t;
+
+  t.hd();
+}
+
+__device__ void h_member_fn() {
+  T t;
+  t.h();
+}
+
+__device__ void unaryOp() {
+  T t;
+  (void) +t;
+}
+
+__device__ void binaryOp() {
+  T t;
+  (void) (t - t);
+}
+
+__device__ void implicitConversion() {
+  T t;
+  Dummy d = t;
+}
+
+template <typename T>
+struct TmplStruct {
+  template <typename U> __device__ void fn() {}
+};
+
+template <>
+template <>
+__device__ void TmplStruct<int>::fn<int>() { host_fn(); }
+
+__device__ void double_specialization() { TmplStruct<int>().fn<int>(); }
diff --git a/clang/test/SemaObjCXX/ivar-struct.mm b/clang/test/SemaObjCXX/ivar-struct.mm
index c8c9ca9cbbf04f..4a039a98abea6c 100644
--- a/clang/test/SemaObjCXX/ivar-struct.mm
+++ b/clang/test/SemaObjCXX/ivar-struct.mm
@@ -1,8 +1,21 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// expected-no-diagnostics
+// RUN: %clang_cc1 -fsyntax-only -Wno-objc-root-class -verify %s
 @interface A {
   struct X {
     int x, y;
   } X;
 }
 @end
+
+static const uint32_t Count = 16; // expected-error {{unknown type name 'uint32_t'}}
+
+struct S0 {
+  S0();
+};
+
+@interface C0
+@end
+
+@implementation C0 {
+  S0 ivar0[Count];
+}
+@end
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp
index 6ab9ecbfd573b2..22788e1c1ffb3a 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp
@@ -603,3 +603,15 @@ namespace PR47792 {
   template void bar<>();    // expected-note {{previous explicit instantiation is here}}
   template void bar<foo>(); // expected-error {{duplicate explicit instantiation of 'bar<&PR47792::foo>'}}
 }
+
+namespace GH68024 {
+template<auto>
+struct s {};
+
+struct {
+  void operator()(int);
+} f;
+
+template<typename T>
+using a = s<f(T::x)>;
+}
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 632e37e3cac8fe..f95b0f8cb317c7 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -595,6 +595,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
   StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
 
   SmallVector<OffloadFile, 4> BitcodeInputFiles;
+  DenseSet<StringRef> StrongResolutions;
   DenseSet<StringRef> UsedInRegularObj;
   DenseSet<StringRef> UsedInSharedLib;
   BumpPtrAllocator Alloc;
@@ -608,6 +609,18 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
     file_magic Type = identify_magic(Buffer.getBuffer());
     switch (Type) {
     case file_magic::bitcode: {
+      Expected<IRSymtabFile> IRSymtabOrErr = readIRSymtab(Buffer);
+      if (!IRSymtabOrErr)
+        return IRSymtabOrErr.takeError();
+
+      // Check for any strong resolutions we need to preserve.
+      for (unsigned I = 0; I != IRSymtabOrErr->Mods.size(); ++I) {
+        for (const auto &Sym : IRSymtabOrErr->TheReader.module_symbols(I)) {
+          if (!Sym.isFormatSpecific() && Sym.isGlobal() && !Sym.isWeak() &&
+              !Sym.isUndefined())
+            StrongResolutions.insert(Saver.save(Sym.Name));
+        }
+      }
       BitcodeInputFiles.emplace_back(std::move(File));
       continue;
     }
@@ -696,6 +709,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
       // it is undefined or another definition has already been used.
       Res.Prevailing =
           !Sym.isUndefined() &&
+          !(Sym.isWeak() && StrongResolutions.contains(Sym.getName())) &&
           PrevailingSymbols.insert(Saver.save(Sym.getName())).second;
 
       // We need LTO to preseve the following global symbols:
diff --git a/clang/tools/clang-offload-bundler/CMakeLists.txt b/clang/tools/clang-offload-bundler/CMakeLists.txt
index dabd82382cdf08..dec2881589a538 100644
--- a/clang/tools/clang-offload-bundler/CMakeLists.txt
+++ b/clang/tools/clang-offload-bundler/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  BinaryFormat
   Object
   Support
   TargetParser
diff --git a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
index f20157622d79f3..90c475e541f4c3 100644
--- a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -136,6 +136,11 @@ int main(int argc, const char **argv) {
     cl::desc("Treat hip and hipv4 offload kinds as "
              "compatible with openmp kind, and vice versa.\n"),
     cl::init(false), cl::cat(ClangOffloadBundlerCategory));
+  cl::opt<bool> Compress("compress",
+                         cl::desc("Compress output file when bundling.\n"),
+                         cl::init(false), cl::cat(ClangOffloadBundlerCategory));
+  cl::opt<bool> Verbose("verbose", cl::desc("Print debug information.\n"),
+                        cl::init(false), cl::cat(ClangOffloadBundlerCategory));
 
   // Process commandline options and report errors
   sys::PrintStackTraceOnErrorSignal(argv[0]);
@@ -163,6 +168,11 @@ int main(int argc, const char **argv) {
   BundlerConfig.BundleAlignment = BundleAlignment;
   BundlerConfig.FilesType = FilesType;
   BundlerConfig.ObjcopyPath = "";
+  // Do not override the default value Compress and Verbose in BundlerConfig.
+  if (Compress.getNumOccurrences() > 0)
+    BundlerConfig.Compress = Compress;
+  if (Verbose.getNumOccurrences() > 0)
+    BundlerConfig.Verbose = Verbose;
 
   BundlerConfig.TargetNames = TargetNames;
   BundlerConfig.InputFileNames = InputFileNames;
diff --git a/clang/tools/clang-rename/ClangRename.cpp b/clang/tools/clang-rename/ClangRename.cpp
index 24c9d8521dc270..f2ac0c4360e0dc 100644
--- a/clang/tools/clang-rename/ClangRename.cpp
+++ b/clang/tools/clang-rename/ClangRename.cpp
@@ -228,7 +228,7 @@ int main(int argc, const char **argv) {
 
     Tool.applyAllReplacements(Rewrite);
     for (const auto &File : Files) {
-      auto Entry = FileMgr.getFile(File);
+      auto Entry = FileMgr.getOptionalFileRef(File);
       if (!Entry) {
         errs() << "clang-rename: " << File << " does not exist.\n";
         return 1;
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index f0c8ecfcb6264f..7915050a78eaff 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2735,6 +2735,7 @@ void OMPClauseEnqueue::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
 }
 void OMPClauseEnqueue::VisitOMPXAttributeClause(const OMPXAttributeClause *C) {
 }
+void OMPClauseEnqueue::VisitOMPXBareClause(const OMPXBareClause *C) {}
 
 } // namespace
 
diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp
index d2977b0cb55b64..474d3dadb5a52c 100644
--- a/clang/unittests/AST/DeclTest.cpp
+++ b/clang/unittests/AST/DeclTest.cpp
@@ -135,8 +135,8 @@ TEST(Decl, MangleDependentSizedArray) {
   std::unique_ptr<ItaniumMangleContext> MC(
       ItaniumMangleContext::create(Ctx, Diags));
 
-  MC->mangleTypeName(DeclA->getType(), OS_A);
-  MC->mangleTypeName(DeclB->getType(), OS_B);
+  MC->mangleCanonicalTypeName(DeclA->getType(), OS_A);
+  MC->mangleCanonicalTypeName(DeclB->getType(), OS_B);
 
   ASSERT_TRUE(0 == MangleA.compare("_ZTSA_i"));
   ASSERT_TRUE(0 == MangleB.compare("_ZTSAT0__T_"));
diff --git a/clang/unittests/Analysis/MacroExpansionContextTest.cpp b/clang/unittests/Analysis/MacroExpansionContextTest.cpp
index 5c694c836a4881..54b209e7b28c18 100644
--- a/clang/unittests/Analysis/MacroExpansionContextTest.cpp
+++ b/clang/unittests/Analysis/MacroExpansionContextTest.cpp
@@ -73,12 +73,7 @@ class MacroExpansionContextTest : public ::testing::Test {
     // Lex source text.
     PP.EnterMainSourceFile();
 
-    while (true) {
-      Token Tok;
-      PP.Lex(Tok);
-      if (Tok.is(tok::eof))
-        break;
-    }
+    PP.LexTokensUntilEOF();
 
     // Callbacks have been executed at this point.
     return Ctx;
diff --git a/clang/unittests/Analysis/UnsafeBufferUsageTest.cpp b/clang/unittests/Analysis/UnsafeBufferUsageTest.cpp
index 66af5750ef2424..e48f39bf13f449 100644
--- a/clang/unittests/Analysis/UnsafeBufferUsageTest.cpp
+++ b/clang/unittests/Analysis/UnsafeBufferUsageTest.cpp
@@ -25,7 +25,7 @@ class UnsafeBufferUsageTest : public ::testing::Test {
 } // namespace
 
 TEST_F(UnsafeBufferUsageTest, FixItHintsConflict) {
-  const FileEntry *DummyFile = FileMgr.getVirtualFile("<virtual>", 100, 0);
+  FileEntryRef DummyFile = FileMgr.getVirtualFileRef("<virtual>", 100, 0);
   FileID DummyFileID = SourceMgr.getOrCreateFileID(DummyFile, SrcMgr::C_User);
   SourceLocation StartLoc = SourceMgr.getLocForStartOfFile(DummyFileID);
 
diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp
index f451e43bb53d78..cd2bd12106b0ed 100644
--- a/clang/unittests/Basic/SourceManagerTest.cpp
+++ b/clang/unittests/Basic/SourceManagerTest.cpp
@@ -138,13 +138,7 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnit) {
   PP.EnterMainSourceFile();
 
   std::vector<Token> toks;
-  while (1) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-    toks.push_back(tok);
-  }
+  PP.LexTokensUntilEOF(&toks);
 
   // Make sure we got the tokens that we expected.
   ASSERT_EQ(3U, toks.size());
@@ -195,13 +189,7 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnitWithTokenSplit) {
   llvm::SmallString<8> Scratch;
 
   std::vector<Token> toks;
-  while (1) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-    toks.push_back(tok);
-  }
+  PP.LexTokensUntilEOF(&toks);
 
   // Make sure we got the tokens that we expected.
   ASSERT_EQ(4U, toks.size()) << "a >> b c";
@@ -452,13 +440,7 @@ TEST_F(SourceManagerTest, getMacroArgExpandedLocation) {
   PP.EnterMainSourceFile();
 
   std::vector<Token> toks;
-  while (1) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-    toks.push_back(tok);
-  }
+  PP.LexTokensUntilEOF(&toks);
 
   // Make sure we got the tokens that we expected.
   ASSERT_EQ(4U, toks.size());
@@ -574,13 +556,7 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnitWithMacroInInclude) {
   PP.EnterMainSourceFile();
 
   std::vector<Token> toks;
-  while (1) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-    toks.push_back(tok);
-  }
+  PP.LexTokensUntilEOF(&toks);
 
   // Make sure we got the tokens that we expected.
   ASSERT_EQ(0U, toks.size());
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 0403de8a9a6559..2ef3c9b299bcad 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -10622,6 +10622,12 @@ TEST_F(FormatTest, WrapsAtNestedNameSpecifiers) {
   verifyFormat("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa::\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
                "        .aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa();");
+
+  verifyFormat(
+      "LongClassNameToShowTheIssue::AndAnotherLongClassNameToShowTheIssue::\n"
+      "    AndAnotherLongClassNameToShowTheIssue() {}\n"
+      "LongClassNameToShowTheIssue::AndAnotherLongClassNameToShowTheIssue::\n"
+      "    ~AndAnotherLongClassNameToShowTheIssue() {}");
 }
 
 TEST_F(FormatTest, UnderstandsTemplateParameters) {
@@ -16339,7 +16345,7 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeParens) {
 
   verifyFormat("int f();", SpaceFuncDef);
   verifyFormat("void f (int a, T b) {}", SpaceFuncDef);
-  verifyFormat("A::A () : a(1) {}", SpaceFuncDef);
+  verifyFormat("A::A() : a(1) {}", SpaceFuncDef);
   verifyFormat("void f() __attribute__((asdf));", SpaceFuncDef);
   verifyFormat("#define A(x) x", SpaceFuncDef);
   verifyFormat("#define A (x) x", SpaceFuncDef);
@@ -16364,7 +16370,7 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeParens) {
   // verifyFormat("T A::operator() () {}", SpaceFuncDef);
   verifyFormat("auto lambda = [] () { return 0; };", SpaceFuncDef);
   verifyFormat("int x = int(y);", SpaceFuncDef);
-  verifyFormat("M (std::size_t R, std::size_t C) : C(C), data(R) {}",
+  verifyFormat("M(std::size_t R, std::size_t C) : C(C), data(R) {}",
                SpaceFuncDef);
 
   FormatStyle SpaceIfMacros = getLLVMStyle();
@@ -18525,12 +18531,17 @@ TEST_F(FormatTest, AlignConsecutiveAssignments) {
                "                     a_longer_name_for_wrap}};",
                Alignment);
 
-  Alignment.ColumnLimit = 60;
+  Alignment = getLLVMStyleWithColumns(60);
+  Alignment.AlignConsecutiveAssignments.Enabled = true;
   verifyFormat("using II = typename TI<T, std::tuple<Types...>>::I;\n"
                "using I  = std::conditional_t<II::value >= 0,\n"
                "                              std::ic<int, II::value + 1>,\n"
                "                              std::ic<int, -1>>;",
                Alignment);
+  verifyFormat("SomeName = Foo;\n"
+               "X        = func<Type, Type>(looooooooooooooooooooooooong,\n"
+               "                            arrrrrrrrrrg);",
+               Alignment);
 }
 
 TEST_F(FormatTest, AlignConsecutiveBitFields) {
@@ -26305,6 +26316,56 @@ TEST_F(FormatTest, RemoveParentheses) {
   verifyFormat("co_return 0;", "co_return ((0));", Style);
   verifyFormat("return 0;", "return (((0)));", Style);
   verifyFormat("return ({ 0; });", "return ((({ 0; })));", Style);
+  verifyFormat("inline decltype(auto) f() {\n"
+               "  if (a) {\n"
+               "    return (a);\n"
+               "  }\n"
+               "  return (b);\n"
+               "}",
+               "inline decltype(auto) f() {\n"
+               "  if (a) {\n"
+               "    return ((a));\n"
+               "  }\n"
+               "  return ((b));\n"
+               "}",
+               Style);
+  verifyFormat("auto g() {\n"
+               "  decltype(auto) x = [] {\n"
+               "    auto y = [] {\n"
+               "      if (a) {\n"
+               "        return a;\n"
+               "      }\n"
+               "      return b;\n"
+               "    };\n"
+               "    if (c) {\n"
+               "      return (c);\n"
+               "    }\n"
+               "    return (d);\n"
+               "  };\n"
+               "  if (e) {\n"
+               "    return e;\n"
+               "  }\n"
+               "  return f;\n"
+               "}",
+               "auto g() {\n"
+               "  decltype(auto) x = [] {\n"
+               "    auto y = [] {\n"
+               "      if (a) {\n"
+               "        return ((a));\n"
+               "      }\n"
+               "      return ((b));\n"
+               "    };\n"
+               "    if (c) {\n"
+               "      return ((c));\n"
+               "    }\n"
+               "    return ((d));\n"
+               "  };\n"
+               "  if (e) {\n"
+               "    return ((e));\n"
+               "  }\n"
+               "  return ((f));\n"
+               "}",
+               Style);
 
   Style.ColumnLimit = 25;
   verifyFormat("return (a + b) - (c + d);",
diff --git a/clang/unittests/Format/FormatTestMacroExpansion.cpp b/clang/unittests/Format/FormatTestMacroExpansion.cpp
index 1ac5ac0d84f125..bc698c60c107d1 100644
--- a/clang/unittests/Format/FormatTestMacroExpansion.cpp
+++ b/clang/unittests/Format/FormatTestMacroExpansion.cpp
@@ -47,10 +47,9 @@ TEST_F(FormatTestMacroExpansion, UnexpandConfiguredMacros) {
     { ID(a *b); });
 )",
                Style);
-  verifyIncompleteFormat(R"(ID3({, ID(a *b),
-  ;
-  });
-)",
+  verifyIncompleteFormat("ID3({, ID(a *b),\n"
+                         "  ;\n"
+                         "  });",
                          Style);
 
   verifyFormat("ID(CALL(CALL(return a * b;)));", Style);
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index cf66076c86216a..62ec460eba7fd9 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1631,65 +1631,77 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName);
 
-  Tokens = annotate("class Foo { public: Foo(); };");
+  Tokens = annotate("#define FOO Foo::\n"
+                    "FOO Foo();");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_FunctionDeclarationName);
+
+  Tokens = annotate("struct Foo {\n"
+                    "  Bar (*func)();\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 14u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
+  EXPECT_TOKEN(Tokens[4], tok::l_paren, TT_FunctionTypeLParen);
+}
+
+TEST_F(TokenAnnotatorTest, UnderstandsCtorAndDtorDeclNames) {
+  auto Tokens = annotate("class Foo { public: Foo(); };");
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
-  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_CtorDtorDeclName);
 
   Tokens = annotate("class Foo { public: ~Foo(); };");
   ASSERT_EQ(Tokens.size(), 13u) << Tokens;
-  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_CtorDtorDeclName);
 
   Tokens = annotate("struct Foo { [[deprecated]] Foo() {} };");
   ASSERT_EQ(Tokens.size(), 16u) << Tokens;
-  EXPECT_TOKEN(Tokens[8], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[8], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("struct Foo { [[deprecated]] ~Foo() {} };");
   ASSERT_EQ(Tokens.size(), 17u) << Tokens;
-  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("struct Foo { Foo() [[deprecated]] {} };");
   ASSERT_EQ(Tokens.size(), 16u) << Tokens;
-  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("struct Foo { ~Foo() [[deprecated]] {} };");
   ASSERT_EQ(Tokens.size(), 17u) << Tokens;
-  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("struct Foo { [[deprecated]] explicit Foo() {} };");
   ASSERT_EQ(Tokens.size(), 17u) << Tokens;
-  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[9], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("struct Foo { virtual [[deprecated]] ~Foo() {} };");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
-  EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[10], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("Foo::Foo() {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
-  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_FunctionLBrace);
 
   Tokens = annotate("Foo::~Foo() {}");
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
-  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_CtorDtorDeclName);
   EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_FunctionLBrace);
 
-  Tokens = annotate("#define FOO Foo::\n"
-                    "FOO Foo();");
-  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
-  EXPECT_TOKEN(Tokens[6], tok::identifier, TT_FunctionDeclarationName);
-
-  Tokens = annotate("struct Foo {\n"
-                    "  Bar (*func)();\n"
+  Tokens = annotate("struct Test {\n"
+                    "  Test()\n"
+                    "      : l([] {\n"
+                    "          Short::foo();\n"
+                    "        }) {}\n"
                     "};");
-  ASSERT_EQ(Tokens.size(), 14u) << Tokens;
-  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
-  EXPECT_TOKEN(Tokens[4], tok::l_paren, TT_FunctionTypeLParen);
+  ASSERT_EQ(Tokens.size(), 25u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_CtorDtorDeclName);
+  EXPECT_TOKEN(Tokens[14], tok::identifier, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsC11GenericSelection) {
@@ -1759,6 +1771,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsTrailingReturnArrow) {
   ASSERT_EQ(Tokens.size(), 16u) << Tokens;
   EXPECT_TOKEN(Tokens[11], tok::arrow, TT_Unknown);
 
+  Tokens = annotate("#define P(ptr) auto p = (ptr)->p");
+  ASSERT_EQ(Tokens.size(), 15u) << Tokens;
+  EXPECT_TOKEN(Tokens[12], tok::arrow, TT_Unknown);
+
   // Mixed
   Tokens = annotate("auto f() -> int { auto a = b()->c; }");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
diff --git a/clang/unittests/Interpreter/ExceptionTests/InterpreterExceptionTest.cpp b/clang/unittests/Interpreter/ExceptionTests/InterpreterExceptionTest.cpp
index 2f1c4efb381f00..7b47d93446192b 100644
--- a/clang/unittests/Interpreter/ExceptionTests/InterpreterExceptionTest.cpp
+++ b/clang/unittests/Interpreter/ExceptionTests/InterpreterExceptionTest.cpp
@@ -122,6 +122,11 @@ extern "C" int throw_exception() {
                               Triple.getArch() == llvm::Triple::aarch64_32))
     GTEST_SKIP();
 
+  // FIXME: RISC-V fails as .eh_frame handling is not yet implemented in
+  // JITLink for RISC-V. See PR #66067.
+  if (Triple.isRISCV())
+    GTEST_SKIP();
+
   llvm::cantFail(Interp->ParseAndExecute(ExceptionCode));
   testing::internal::CaptureStdout();
   auto ThrowException =
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 8932265674a59a..47aa2c131a304d 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -74,13 +74,7 @@ class LexerTest : public ::testing::Test {
     PP = CreatePP(Source, ModLoader);
 
     std::vector<Token> toks;
-    while (1) {
-      Token tok;
-      PP->Lex(tok);
-      if (tok.is(tok::eof))
-        break;
-      toks.push_back(tok);
-    }
+    PP->LexTokensUntilEOF(&toks);
 
     return toks;
   }
@@ -628,12 +622,7 @@ TEST_F(LexerTest, FindNextToken) {
 TEST_F(LexerTest, CreatedFIDCountForPredefinedBuffer) {
   TrivialModuleLoader ModLoader;
   auto PP = CreatePP("", ModLoader);
-  while (1) {
-    Token tok;
-    PP->Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-  }
+  PP->LexTokensUntilEOF();
   EXPECT_EQ(SourceMgr.getNumCreatedFIDsForFileID(PP->getPredefinesFileID()),
             1U);
 }
diff --git a/clang/unittests/Lex/ModuleDeclStateTest.cpp b/clang/unittests/Lex/ModuleDeclStateTest.cpp
index ed694384da57cf..15306ba22bf67e 100644
--- a/clang/unittests/Lex/ModuleDeclStateTest.cpp
+++ b/clang/unittests/Lex/ModuleDeclStateTest.cpp
@@ -90,12 +90,7 @@ class ModuleDeclStateTest : public ::testing::Test {
     PP.addPPCallbacks(std::move(C));
     PP.EnterMainSourceFile();
 
-    while (1) {
-      Token tok;
-      PP.Lex(tok);
-      if (tok.is(tok::eof))
-        break;
-    }
+    PP.LexTokensUntilEOF();
   }
 
   FileSystemOptions FileMgrOpts;
diff --git a/clang/unittests/Lex/PPCallbacksTest.cpp b/clang/unittests/Lex/PPCallbacksTest.cpp
index b2be40ef3812e3..e0a27b5111821b 100644
--- a/clang/unittests/Lex/PPCallbacksTest.cpp
+++ b/clang/unittests/Lex/PPCallbacksTest.cpp
@@ -229,13 +229,7 @@ class PPCallbacksTest : public ::testing::Test {
 
     // Lex source text.
     PP.EnterMainSourceFile();
-
-    while (true) {
-      Token Tok;
-      PP.Lex(Tok);
-      if (Tok.is(tok::eof))
-        break;
-    }
+    PP.LexTokensUntilEOF();
 
     // Callbacks have been executed at this point -- return filename range.
     return Callbacks;
@@ -259,13 +253,7 @@ class PPCallbacksTest : public ::testing::Test {
 
     // Lex source text.
     PP.EnterMainSourceFile();
-
-    while (true) {
-      Token Tok;
-      PP.Lex(Tok);
-      if (Tok.is(tok::eof))
-        break;
-    }
+    PP.LexTokensUntilEOF();
 
     return Callbacks->Results;
   }
@@ -290,12 +278,7 @@ class PPCallbacksTest : public ::testing::Test {
 
     // Lex source text.
     PP.EnterMainSourceFile();
-    while (true) {
-      Token Tok;
-      PP.Lex(Tok);
-      if (Tok.is(tok::eof))
-        break;
-    }
+    PP.LexTokensUntilEOF();
 
     return Callbacks->Marks;
   }
@@ -334,12 +317,7 @@ class PPCallbacksTest : public ::testing::Test {
 
     // Lex source text.
     PP.EnterMainSourceFile();
-    while (true) {
-      Token Tok;
-      PP.Lex(Tok);
-      if (Tok.is(tok::eof))
-        break;
-    }
+    PP.LexTokensUntilEOF();
 
     PragmaOpenCLExtensionCallbacks::CallbackParameters RetVal = {
       Callbacks->Name,
@@ -477,12 +455,7 @@ TEST_F(PPCallbacksTest, FileNotFoundSkipped) {
 
   // Lex source text.
   PP.EnterMainSourceFile();
-  while (true) {
-    Token Tok;
-    PP.Lex(Tok);
-    if (Tok.is(tok::eof))
-      break;
-  }
+  PP.LexTokensUntilEOF();
 
   ASSERT_EQ(1u, Callbacks->NumCalls);
   ASSERT_EQ(0u, DiagConsumer->getNumErrors());
diff --git a/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp b/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp
index ba756395786518..84c4cc3a1b2dc0 100644
--- a/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp
+++ b/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp
@@ -87,13 +87,7 @@ TEST_F(PPConditionalDirectiveRecordTest, PPRecAPI) {
   PP.EnterMainSourceFile();
 
   std::vector<Token> toks;
-  while (1) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-    toks.push_back(tok);
-  }
+  PP.LexTokensUntilEOF(&toks);
 
   // Make sure we got the tokens that we expected.
   ASSERT_EQ(10U, toks.size());
diff --git a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp
index 4685021b6ecdba..6ff87f720a559f 100644
--- a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp
+++ b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp
@@ -133,12 +133,7 @@ TEST_F(PPDependencyDirectivesTest, MacroGuard) {
   SmallVector<StringRef> IncludedFiles;
   PP.addPPCallbacks(std::make_unique<IncludeCollector>(PP, IncludedFiles));
   PP.EnterMainSourceFile();
-  while (true) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-  }
+  PP.LexTokensUntilEOF();
 
   SmallVector<std::string> IncludedFilesSlash;
   for (StringRef IncludedFile : IncludedFiles)
diff --git a/clang/unittests/Lex/PPMemoryAllocationsTest.cpp b/clang/unittests/Lex/PPMemoryAllocationsTest.cpp
index 894d94e149a9ff..f7fb097b1f7faf 100644
--- a/clang/unittests/Lex/PPMemoryAllocationsTest.cpp
+++ b/clang/unittests/Lex/PPMemoryAllocationsTest.cpp
@@ -75,12 +75,7 @@ TEST_F(PPMemoryAllocationsTest, PPMacroDefinesAllocations) {
   PP.Initialize(*Target);
   PP.EnterMainSourceFile();
 
-  while (1) {
-    Token tok;
-    PP.Lex(tok);
-    if (tok.is(tok::eof))
-      break;
-  }
+  PP.LexTokensUntilEOF();
 
   size_t NumAllocated = PP.getPreprocessorAllocator().getBytesAllocated();
   float BytesPerDefine = float(NumAllocated) / float(NumMacros);
diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
index fdfbbfe4e3a9df..a7ca2bf91e474e 100644
--- a/clang/unittests/Support/TimeProfilerTest.cpp
+++ b/clang/unittests/Support/TimeProfilerTest.cpp
@@ -190,6 +190,7 @@ Frontend
 | EvaluateAsInitializer (slow_value)
 | EvaluateAsConstantExpr (<test.cc:17:33, col:59>)
 | EvaluateAsConstantExpr (<test.cc:18:11, col:37>)
+| EvaluateAsConstantExpr (<test.cc:23:31, col:57>)
 | EvaluateAsRValue (<test.cc:22:14, line:23:58>)
 | EvaluateAsInitializer (slow_init_list)
 | PerformPendingInstantiations
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index ffada02ac4d30a..f2a6c1dfcf2fcc 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -2690,7 +2690,8 @@ static void emitAttributes(RecordKeeper &Records, raw_ostream &OS,
         OS << ", ";
         emitFormInitializer(OS, Spellings[0], "0");
       } else {
-        OS << ", (\n";
+        OS << ", [&]() {\n";
+        OS << "    switch (S) {\n";
         std::set<std::string> Uniques;
         unsigned Idx = 0;
         for (auto I = Spellings.begin(), E = Spellings.end(); I != E;
@@ -2698,15 +2699,19 @@ static void emitAttributes(RecordKeeper &Records, raw_ostream &OS,
           const FlattenedSpelling &S = *I;
           const auto &Name = SemanticToSyntacticMap[Idx];
           if (Uniques.insert(Name).second) {
-            OS << "    S == " << Name << " ? AttributeCommonInfo::Form";
+            OS << "    case " << Name << ":\n";
+            OS << "      return AttributeCommonInfo::Form";
             emitFormInitializer(OS, S, Name);
-            OS << " :\n";
+            OS << ";\n";
           }
         }
-        OS << "    (llvm_unreachable(\"Unknown attribute spelling!\"), "
-           << " AttributeCommonInfo::Form";
+        OS << "    default:\n";
+        OS << "      llvm_unreachable(\"Unknown attribute spelling!\");\n"
+           << "      return AttributeCommonInfo::Form";
         emitFormInitializer(OS, Spellings[0], "0");
-        OS << "))";
+        OS << ";\n"
+           << "    }\n"
+           << "  }()";
       }
 
       OS << ");\n";
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 99e5c0109d4ffc..13bb1971c61deb 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -1186,12 +1186,12 @@ <h2 id="c2x">C23 implementation status</h2>
     <tr>
       <td>Underspecified object definitions</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3006.htm">N3006</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="none" align="center">No</td>
     </tr>
     <tr>
       <td>Type inference for object declarations</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3007.htm">N3007</a></td>
-      <td class="none" align="center">No</td>
+      <td class="unreleased" align="center">Clang 18</td>
     </tr>
     <tr>
       <td>constexpr for object definitions</td>
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index ee9712e9bab994..e711b2173d191c 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -7145,7 +7145,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/1223.html">1223</a></td>
     <td>drafting</td>
     <td>Syntactic disambiguation and <I>trailing-return-type</I>s</td>
-    <td align="center">Not resolved</td>
+    <td class="unreleased" align="center">Clang 17</td>
   </tr>
   <tr id="1224">
     <td><a href="https://cplusplus.github.io/CWG/issues/1224.html">1224</a></td>
@@ -7853,7 +7853,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/1341.html">1341</a></td>
     <td>NAD</td>
     <td>Bit-field initializers</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="na" align="center">Superseded by <a href="https://wg21.link/P0683R1">P0683R1</a></td>
   </tr>
   <tr id="1342">
     <td><a href="https://cplusplus.github.io/CWG/issues/1342.html">1342</a></td>
@@ -12653,7 +12653,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2141.html">2141</a></td>
     <td>CD4</td>
     <td>Ambiguity in <I>new-expression</I> with <I>elaborated-type-specifier</I></td>
-    <td class="unreleased" align="center">Clang 17</td>
+    <td class="full" align="center">Clang 17</td>
   </tr>
   <tr id="2142">
     <td><a href="https://cplusplus.github.io/CWG/issues/2142.html">2142</a></td>
@@ -13409,7 +13409,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2267.html">2267</a></td>
     <td>CD5</td>
     <td>Copy-initialization of temporary in reference direct-initialization</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="none" align="center">No</td>
   </tr>
   <tr id="2268">
     <td><a href="https://cplusplus.github.io/CWG/issues/2268.html">2268</a></td>
@@ -14189,7 +14189,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2397.html">2397</a></td>
     <td>CD6</td>
     <td><TT>auto</TT> specifier for pointers and references to arrays</td>
-    <td class="unreleased" align="center">Clang 17</td>
+    <td class="full" align="center">Clang 17</td>
   </tr>
   <tr class="open" id="2398">
     <td><a href="https://cplusplus.github.io/CWG/issues/2398.html">2398</a></td>
@@ -14915,7 +14915,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2518.html">2518</a></td>
     <td>C++23</td>
     <td>Conformance requirements and <TT>#error</TT>/<TT>#warning</TT></td>
-    <td class="unreleased" align="center">Clang 17</td>
+    <td class="full" align="center">Clang 17</td>
   </tr>
   <tr id="2519">
     <td><a href="https://cplusplus.github.io/CWG/issues/2519.html">2519</a></td>
@@ -14933,7 +14933,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2521.html">2521</a></td>
     <td>C++23</td>
     <td>User-defined literals and reserved identifiers</td>
-    <td class="unreleased" align="center">Clang 17</td>
+    <td class="full" align="center">Clang 17</td>
   </tr>
   <tr class="open" id="2522">
     <td><a href="https://cplusplus.github.io/CWG/issues/2522.html">2522</a></td>
@@ -15125,13 +15125,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2553.html">2553</a></td>
     <td>ready</td>
     <td>Restrictions on explicit object member functions</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr class="open" id="2554">
     <td><a href="https://cplusplus.github.io/CWG/issues/2554.html">2554</a></td>
     <td>review</td>
     <td>Overriding virtual functions, also with explicit object parameters</td>
-    <td align="center">Not resolved</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr class="open" id="2555">
     <td><a href="https://cplusplus.github.io/CWG/issues/2555.html">2555</a></td>
@@ -15173,7 +15173,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2561.html">2561</a></td>
     <td>review</td>
     <td>Conversion to function pointer for lambda with explicit object parameter</td>
-    <td align="center">Not resolved</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr class="open" id="2562">
     <td><a href="https://cplusplus.github.io/CWG/issues/2562.html">2562</a></td>
@@ -15725,7 +15725,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2653.html">2653</a></td>
     <td>C++23</td>
     <td>Can an explicit object parameter have a default argument?</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr id="2654">
     <td><a href="https://cplusplus.github.io/CWG/issues/2654.html">2654</a></td>
@@ -15893,7 +15893,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2681.html">2681</a></td>
     <td>C++23</td>
     <td>Deducing member array type from string literal</td>
-    <td class="unreleased" align="center">Clang 17</td>
+    <td class="full" align="center">Clang 17</td>
   </tr>
   <tr id="2682">
     <td><a href="https://cplusplus.github.io/CWG/issues/2682.html">2682</a></td>
@@ -15929,7 +15929,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2687.html">2687</a></td>
     <td>C++23</td>
     <td>Calling an explicit object member function via an address-of-overload-set</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr class="open" id="2688">
     <td><a href="https://cplusplus.github.io/CWG/issues/2688.html">2688</a></td>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index 41acb47b40d878..e2cf9ab2546521 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -222,7 +222,7 @@ <h2 id="cxx23">C++23 implementation status</h2>
     <tr>
       <td rowspan=2>Deducing this</td>
       <td><a href="https://wg21.link/P0847R7">P0847R7</a></td>
-      <td class="none" align="center">No</td>
+      <td class="unreleased" align="center">Clang 18</td>
     </tr>
     <tr>
       <td><a href="https://wg21.link/P2797R0">P2797R0</a></td>
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index afb7189c0ececc..0c1f9c73eff1c8 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -39,7 +39,11 @@ def collect_tests():
     test_cpp = os.path.join(dr_test_dir, test_cpp)
     found_any = False;
     for match in re.finditer(status_re, open(test_cpp, 'r').read()):
-      status_map[int(match.group(1))] = match.group(2)
+      dr_number = int(match.group(1))
+      if dr_number in status_map:
+        print("error: Comment for dr{} encountered more than once. Duplicate found in {}".format(dr_number, test_cpp))
+        sys.exit(1)
+      status_map[dr_number] = match.group(2)
       found_any = True
     if not found_any:
       print("warning:%s: no '// dr123: foo' comments in this file" % test_cpp, file=sys.stderr)
@@ -122,7 +126,7 @@ out_file.write('''\
     <th>Available in Clang?</th>
   </tr>''')
 
-latest_release = 16
+latest_release = 17
 
 def availability(issue):
   status = status_map.get(issue, 'unknown')
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index cc3190bd7f761e..1a46f5b3348069 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -5,7 +5,9 @@
 
 cmake_minimum_required(VERSION 3.20.0)
 
-set(LLVM_COMMON_CMAKE_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
+if(NOT DEFINED LLVM_COMMON_CMAKE_UTILS)
+  set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
+endif()
 include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
   NO_POLICY_SCOPE)
 
@@ -375,6 +377,25 @@ endif()
 if(MSVC)
   # FIXME: In fact, sanitizers should support both /MT and /MD, see PR20214.
   set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+
+  # Remove any /M[DT][d] flags, and strip any definitions of _DEBUG.
+  # Since we're using CMAKE_MSVC_RUNTIME_LIBRARY (CMP0091 set to NEW),
+  # these options shouldn't be included in these flags variables. However,
+  # package managers that don't know which mechanism is used for passing
+  # CRT choice flags might be passing them both ways - which leads to
+  # duplicate CRT choice options. Thus make sure to strip out these flags
+  # from these variables, when we're forcing a CRT choice other than what
+  # the user requested here.
+  foreach(flag_var
+    CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+    CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
+    CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+    CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    string(REGEX REPLACE "[/-]M[DT]d" "" ${flag_var} "${${flag_var}}")
+    string(REGEX REPLACE "[/-]MD" "" ${flag_var} "${${flag_var}}")
+    string(REGEX REPLACE "[/-]D_DEBUG" "" ${flag_var} "${${flag_var}}")
+  endforeach()
+
   append_list_if(COMPILER_RT_HAS_Oy_FLAG /Oy- SANITIZER_COMMON_CFLAGS)
   append_list_if(COMPILER_RT_HAS_GS_FLAG /GS- SANITIZER_COMMON_CFLAGS)
 
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 09a9b62ce4cd37..a8e078f1ebc988 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -63,6 +63,16 @@ if (C_SUPPORTS_NODEFAULTLIBS_FLAG)
                         moldname mingwex msvcrt)
     list(APPEND CMAKE_REQUIRED_LIBRARIES ${MINGW_LIBRARIES})
   endif()
+  if (NOT TARGET unwind)
+    # Don't check for a library named unwind, if there's a target with that name within
+    # the same build.
+    check_library_exists(unwind _Unwind_GetRegionStart "" COMPILER_RT_HAS_LIBUNWIND)
+    if (COMPILER_RT_HAS_LIBUNWIND)
+      # If we're omitting default libraries, we might need to manually link in libunwind.
+      # This can affect whether we detect a statically linked libc++ correctly.
+      list(APPEND CMAKE_REQUIRED_LIBRARIES unwind)
+    endif()
+  endif()
 endif ()
 
 # CodeGen options.
diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
index 8a8214bd99fef3..5903ed837917ca 100644
--- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h
+++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
@@ -158,7 +158,7 @@ FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) {
   // picking its contents.
   std::string result;
 
-  // Reserve the anticipated capaticity to prevent several reallocations.
+  // Reserve the anticipated capacity to prevent several reallocations.
   result.reserve(std::min(max_length, remaining_bytes_));
   for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
     char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 4456bf1ab17632..8ba7e186d4fb1a 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -644,7 +644,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
        (uint64_t)'p' << 40 | (uint64_t)'r' << 32 | (uint64_t)'o' << 24 |  \
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
-/* FIXME: Please remedy the fixme in the header before bumping the version. */
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 8
 /* Indexed profile format version (start from 1). */
@@ -652,9 +651,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 5
 
-/* Profile version is always of type uint64_t. Reserve the upper 8 bits in the
- * version for other variants of profile. We set the lowest bit of the upper 8
- * bits (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation
+/* Profile version is always of type uint64_t. Reserve the upper 32 bits in the
+ * version for other variants of profile. We set the 8th most significant bit 
+ * (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation
  * generated profile, and 0 if this is a Clang FE generated profile.
  * 1 in bit 57 indicates there are context-sensitive records in the profile.
  * The 59th bit indicates whether to use debug info to correlate profiles.
@@ -663,7 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
  * The 62nd bit indicates whether memory profile information is present.
  * The 63rd bit indicates if this is a temporal profile.
  */
-#define VARIANT_MASKS_ALL 0xff00000000000000ULL
+#define VARIANT_MASKS_ALL 0xffffffff00000000ULL
 #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
 #define VARIANT_MASK_IR_PROF (0x1ULL << 56)
 #define VARIANT_MASK_CSIR_PROF (0x1ULL << 57)
diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt
index bc5fe0be120f8d..48d0e91052d73b 100644
--- a/compiler-rt/lib/asan/CMakeLists.txt
+++ b/compiler-rt/lib/asan/CMakeLists.txt
@@ -92,7 +92,7 @@ append_rtti_flag(OFF ASAN_CFLAGS)
 
 # Silence warnings in system headers with MSVC.
 if(NOT CLANG_CL)
-  append_list_if(COMPILER_RT_HAS_EXTERNAL_FLAG "/experimental:external /external:W0 /external:anglebrackets" ASAN_CFLAGS)
+  append_list_if(COMPILER_RT_HAS_EXTERNAL_FLAG "/experimental:external;/external:W0;/external:anglebrackets" ASAN_CFLAGS)
 endif()
 
 # Too many existing bugs, needs cleanup.
diff --git a/compiler-rt/lib/asan/asan_poisoning.cpp b/compiler-rt/lib/asan/asan_poisoning.cpp
index 6bfc79ecb34b9c..746ad61813c65d 100644
--- a/compiler-rt/lib/asan/asan_poisoning.cpp
+++ b/compiler-rt/lib/asan/asan_poisoning.cpp
@@ -446,8 +446,11 @@ void __sanitizer_annotate_contiguous_container(const void *beg_p,
   // https://github.com/google/sanitizers/issues/258.
   // if (d1 != d2)
   //  DCHECK_EQ(*(u8*)MemToShadow(d1), old_mid - d1);
-  if (a + granularity <= d1)
+  //
+  // NOTE: curly brackets for the "if" below to silence a MSVC warning.
+  if (a + granularity <= d1) {
     DCHECK_EQ(*(u8 *)MemToShadow(a), 0);
+  }
   // if (d2 + granularity <= c && c <= end)
   //   DCHECK_EQ(*(u8 *)MemToShadow(c - granularity),
   //            kAsanContiguousContainerOOBMagic);
diff --git a/compiler-rt/lib/asan/asan_thread.h b/compiler-rt/lib/asan/asan_thread.h
index 77f61a3cacc50e..62f1b5337fe4bf 100644
--- a/compiler-rt/lib/asan/asan_thread.h
+++ b/compiler-rt/lib/asan/asan_thread.h
@@ -56,6 +56,13 @@ class AsanThreadContext final : public ThreadContextBase {
 // AsanThreadContext objects are never freed, so we need many of them.
 COMPILER_CHECK(sizeof(AsanThreadContext) <= 256);
 
+#if defined(_MSC_VER) && !defined(__clang__)
+// MSVC raises a warning about a nonstandard extension being used for the 0
+// sized element in this array. Disable this for warn-as-error builds.
+#  pragma warning(push)
+#  pragma warning(disable : 4200)
+#endif
+
 // AsanThread are stored in TSD and destroyed when the thread dies.
 class AsanThread {
  public:
@@ -185,6 +192,10 @@ class AsanThread {
   char start_data_[];
 };
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#  pragma warning(pop)
+#endif
+
 // Returns a single instance of registry.
 ThreadRegistry &asanThreadRegistry();
 ThreadArgRetval &asanThreadArgRetval();
diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c
index 329dd8e6ae92db..e413a78817766c 100644
--- a/compiler-rt/lib/builtins/cpu_model.c
+++ b/compiler-rt/lib/builtins/cpu_model.c
@@ -1229,7 +1229,11 @@ enum CPUFeatures {
   FEAT_SME_F64,
   FEAT_SME_I64,
   FEAT_SME2,
-  FEAT_MAX
+  FEAT_RCPC3,
+  FEAT_MAX,
+  FEAT_EXT = 62, // Reserved to indicate presence of additional features field
+                 // in __aarch64_cpu_features
+  FEAT_INIT      // Used as flag of features initialization completion
 };
 
 // Architecture features used
@@ -1388,6 +1392,9 @@ static void __init_cpu_features_constructor(unsigned long hwcap,
     // ID_AA64ISAR1_EL1.LRCPC != 0b0000
     if (extractBits(ftr, 20, 4) != 0x0)
       setCPUFeature(FEAT_RCPC);
+    // ID_AA64ISAR1_EL1.LRCPC == 0b0011
+    if (extractBits(ftr, 20, 4) == 0x3)
+      setCPUFeature(FEAT_RCPC3);
     // ID_AA64ISAR1_EL1.SPECRES == 0b0001
     if (extractBits(ftr, 40, 4) == 0x2)
       setCPUFeature(FEAT_PREDRES);
@@ -1423,7 +1430,7 @@ static void __init_cpu_features_constructor(unsigned long hwcap,
     if (hwcap & HWCAP_SHA3)
       setCPUFeature(FEAT_SHA3);
   }
-  setCPUFeature(FEAT_MAX);
+  setCPUFeature(FEAT_INIT);
 }
 
 void __init_cpu_features_resolver(unsigned long hwcap,
diff --git a/compiler-rt/lib/interception/CMakeLists.txt b/compiler-rt/lib/interception/CMakeLists.txt
index 3242cf50e35f86..abe9229340be7a 100644
--- a/compiler-rt/lib/interception/CMakeLists.txt
+++ b/compiler-rt/lib/interception/CMakeLists.txt
@@ -21,7 +21,7 @@ append_rtti_flag(OFF INTERCEPTION_CFLAGS)
 
 # Silence warnings in system headers with MSVC.
 if(NOT CLANG_CL)
-  append_list_if(COMPILER_RT_HAS_EXTERNAL_FLAG "/experimental:external /external:W0 /external:anglebrackets" INTERCEPTION_CFLAGS)
+  append_list_if(COMPILER_RT_HAS_EXTERNAL_FLAG "/experimental:external;/external:W0;/external:anglebrackets" INTERCEPTION_CFLAGS)
 endif()
 
 add_compiler_rt_object_libraries(RTInterception
diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index b2ba40902347f4..23d26ea94b51a6 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -1,4 +1,4 @@
-//===-- interception_linux.cpp ----------------------------------*- C++ -*-===//
+//===-- interception_win.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h
index fa43ac50c61e4f..d77bc05b780203 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h
@@ -636,7 +636,8 @@ class SizeClassAllocator64 {
   }
   uptr SpaceEnd() const { return  SpaceBeg() + kSpaceSize; }
   // kRegionSize should be able to satisfy the largest size class.
-  static_assert(kRegionSize >= SizeClassMap::kMaxSize);
+  static_assert(kRegionSize >= SizeClassMap::kMaxSize,
+                "Region size exceed largest size");
   // kRegionSize must be <= 2^36, see CompactPtrT.
   COMPILER_CHECK((kRegionSize) <= (1ULL << (SANITIZER_WORDSIZE / 2 + 4)));
   // Call mmap for user memory with at least this size.
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index e4ba6ee7886f24..b013f03a73ae40 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -245,12 +245,14 @@ class Allocator {
   // - unlinking the local stats from the global ones (destroying the cache does
   //   the last two items).
   void commitBack(TSD<ThisT> *TSD) {
+    TSD->assertLocked(/*BypassCheck=*/true);
     Quarantine.drain(&TSD->getQuarantineCache(),
                      QuarantineCallback(*this, TSD->getCache()));
     TSD->getCache().destroy(&Stats);
   }
 
   void drainCache(TSD<ThisT> *TSD) {
+    TSD->assertLocked(/*BypassCheck=*/true);
     Quarantine.drainAndRecycle(&TSD->getQuarantineCache(),
                                QuarantineCallback(*this, TSD->getCache()));
     TSD->getCache().drain();
@@ -363,6 +365,7 @@ class Allocator {
       DCHECK_NE(ClassId, 0U);
       bool UnlockRequired;
       auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
+      TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
       Block = TSD->getCache().allocate(ClassId);
       // If the allocation failed, the most likely reason with a 32-bit primary
       // is the region being full. In that event, retry in each successively
@@ -1147,6 +1150,7 @@ class Allocator {
       if (LIKELY(ClassId)) {
         bool UnlockRequired;
         auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
+        TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
         const bool CacheDrained =
             TSD->getCache().deallocate(ClassId, BlockBegin);
         if (UnlockRequired)
@@ -1166,6 +1170,7 @@ class Allocator {
     } else {
       bool UnlockRequired;
       auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
+      TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
       Quarantine.put(&TSD->getQuarantineCache(),
                      QuarantineCallback(*this, TSD->getCache()), Ptr, Size);
       if (UnlockRequired)
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index d0890d1c5e2d3d..c89e6a95f5a68a 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -174,9 +174,9 @@ template <typename Config> class MapAllocatorCache {
       if (!Entry.isValid())
         continue;
       Str->append("StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, "
-                  "BlockSize: %zu\n",
+                  "BlockSize: %zu %s\n",
                   Entry.CommitBase, Entry.CommitBase + Entry.CommitSize,
-                  Entry.CommitSize);
+                  Entry.CommitSize, Entry.Time == 0 ? "[R]" : "");
     }
   }
 
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 1143aaab8371d3..7958e9dabf60db 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -512,6 +512,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) NO_THREAD_SAFETY_ANALYSIS {
 
   bool UnlockRequired;
   auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
+  TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
   EXPECT_TRUE(!TSD->getCache().isEmpty());
   TSD->getCache().drain();
   EXPECT_TRUE(TSD->getCache().isEmpty());
@@ -536,6 +537,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ForceCacheDrain) NO_THREAD_SAFETY_ANALYSIS {
 
   bool UnlockRequired;
   auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
+  TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
   EXPECT_TRUE(TSD->getCache().isEmpty());
   EXPECT_EQ(TSD->getQuarantineCache().getSize(), 0U);
   EXPECT_TRUE(Allocator->getQuarantine()->isEmpty());
@@ -842,6 +844,7 @@ TEST(ScudoCombinedTest, BasicTrustyConfig) {
 
   bool UnlockRequired;
   auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
+  TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
   TSD->getCache().drain();
 
   Allocator->releaseToOS(scudo::ReleaseToOS::Force);
diff --git a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
index 5953d759a69a97..fad8fcf9007711 100644
--- a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
@@ -101,6 +101,7 @@ template <class AllocatorT> static void testRegistry() {
 
   bool UnlockRequired;
   auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+  TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
   EXPECT_NE(TSD, nullptr);
   EXPECT_EQ(TSD->getCache().Canary, 0U);
   if (UnlockRequired)
@@ -108,6 +109,7 @@ template <class AllocatorT> static void testRegistry() {
 
   Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/false);
   TSD = Registry->getTSDAndLock(&UnlockRequired);
+  TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
   EXPECT_NE(TSD, nullptr);
   EXPECT_EQ(TSD->getCache().Canary, 0U);
   memset(&TSD->getCache(), 0x42, sizeof(TSD->getCache()));
@@ -137,6 +139,7 @@ template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) {
   Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false);
   bool UnlockRequired;
   auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+  TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
   EXPECT_NE(TSD, nullptr);
   // For an exclusive TSD, the cache should be empty. We cannot guarantee the
   // same for a shared TSD.
@@ -195,6 +198,7 @@ static void stressSharedRegistry(MockAllocator<SharedCaches> *Allocator) {
   bool UnlockRequired;
   for (scudo::uptr I = 0; I < 4096U; I++) {
     auto TSD = Registry->getTSDAndLock(&UnlockRequired);
+    TSD->assertLocked(/*BypassCheck=*/!UnlockRequired);
     EXPECT_NE(TSD, nullptr);
     Set.insert(reinterpret_cast<void *>(TSD));
     if (UnlockRequired)
diff --git a/compiler-rt/lib/scudo/standalone/tsd.h b/compiler-rt/lib/scudo/standalone/tsd.h
index f4fa545de5e046..b2108a01900bcc 100644
--- a/compiler-rt/lib/scudo/standalone/tsd.h
+++ b/compiler-rt/lib/scudo/standalone/tsd.h
@@ -53,8 +53,14 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD {
   inline void unlock() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); }
   inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); }
 
-  void commitBack(Allocator *Instance) ASSERT_CAPABILITY(Mutex) {
-    Instance->commitBack(this);
+  void commitBack(Allocator *Instance) { Instance->commitBack(this); }
+
+  // As the comments attached to `getCache()`, the TSD doesn't always need to be
+  // locked. In that case, we would only skip the check before we have all TSDs
+  // locked in all paths.
+  void assertLocked(bool BypassCheck) ASSERT_CAPABILITY(Mutex) {
+    if (SCUDO_DEBUG && !BypassCheck)
+      Mutex.assertHeld();
   }
 
   // Ideally, we may want to assert that all the operations on
@@ -66,11 +72,8 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD {
   // TODO(chiahungduan): Ideally, we want to do `Mutex.assertHeld` but acquiring
   // TSD doesn't always require holding the lock. Add this assertion while the
   // lock is always acquired.
-  typename Allocator::CacheT &getCache() ASSERT_CAPABILITY(Mutex) {
-    return Cache;
-  }
-  typename Allocator::QuarantineCacheT &getQuarantineCache()
-      ASSERT_CAPABILITY(Mutex) {
+  typename Allocator::CacheT &getCache() REQUIRES(Mutex) { return Cache; }
+  typename Allocator::QuarantineCacheT &getQuarantineCache() REQUIRES(Mutex) {
     return QuarantineCache;
   }
 
diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h
index dcb0948ad78faa..1bca578ee14be4 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@@ -120,6 +120,11 @@ struct TSDRegistrySharedT {
                 TSDsArraySize);
     for (uptr I = 0; I < NumberOfTSDs; ++I) {
       TSDs[I].lock();
+      // Theoretically, we want to mark TSD::lock()/TSD::unlock() with proper
+      // thread annotations. However, given the TSD is only locked on shared
+      // path, do the assertion in a separate path to avoid confusing the
+      // analyzer.
+      TSDs[I].assertLocked(/*BypassCheck=*/true);
       Str->append("  Shared TSD[%zu]:\n", I);
       TSDs[I].getCache().getStats(Str);
       TSDs[I].unlock();
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
index 2031726afe0b59..3f1e12ed9ac66f 100644
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -57,7 +57,7 @@ append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CXXFLAGS)
 
 # Silence warnings in system headers with MSVC.
 if(NOT CLANG_CL)
-  append_list_if(COMPILER_RT_HAS_EXTERNAL_FLAG "/experimental:external /external:W0 /external:anglebrackets" UBSAN_CXXFLAGS)
+  append_list_if(COMPILER_RT_HAS_EXTERNAL_FLAG "/experimental:external;/external:W0;/external:anglebrackets" UBSAN_CXXFLAGS)
 endif()
 
 set(UBSAN_LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS})
diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt
deleted file mode 100644
index 053869b8769e21..00000000000000
--- a/flang-rt/CMakeLists.txt
+++ /dev/null
@@ -1,148 +0,0 @@
-# CMake build for the Flang runtime libraries
-# The source for the flang runtime libraries (FortranDecimalRT, FortranRuntime)
-# exist in the flang top-level directory.
-# Flang-rt is only scaffolding and does not provide any additional source files.
-
-cmake_minimum_required(VERSION 3.20.0)
-
-#===============================================================================
-# Configure CMake
-#===============================================================================
-set(LLVM_COMMON_CMAKE_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
-include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
-  NO_POLICY_SCOPE)
-
-set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
-set(CMAKE_INCLUDE_CURRENT_DIR ON)
-
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/flang-rt/bin)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY
-  ${CMAKE_BINARY_DIR}/flang-rt/lib)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY
-  ${CMAKE_BINARY_DIR}/flang-rt/lib)
-set(CMAKE_CURRENT_BINARY_DIR ${CMAKE_BINARY_DIR}/flang-rt)
-
-set(FLANG_RT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-set(FLANG_RT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-
-#===============================================================================
-# Setup Options and Defaults
-#===============================================================================
-option(FLANG_RT_ENABLE_SHARED "Build flang-rt as a shared library." OFF)
-option(FLANG_RT_ENABLE_STATIC "Build flang-rt as a static library." OFF)
-
-option(FLANG_RT_INCLUDE_TESTS
-    "Generate build targets for the Flang-rt unit tests." ${LLVM_INCLUDE_TESTS})
-
-# MLIR_DIR must be passed on invocation of flang-rt because it is needed for the Flang package.
-if(NOT DEFINED MLIR_DIR OR MLIR_DIR STREQUAL "")
-  message(FATAL_ERROR "MLIR_DIR must be set to the directory of the MLIRConfig cmake file in order to find the MLIR package.")
-endif()
-# Flang-rt requires a pre-built/installed version of flang that requires MLIR.
-find_package(MLIR REQUIRED HINTS "${MLIR_DIR}")
-
-# FLANG_DIR must be passed on invocation of flang-rt.
-if(NOT DEFINED FLANG_DIR OR FLANG_DIR STREQUAL "")
-  message(FATAL_ERROR "FLANG_DIR must be set to the directory of the FlangConfig cmake file in order to find the Flang package.")
-endif()
-# Flang-rt requires a pre-built/installed version of flang.
-# Flang-rt uses flang/Common headers.
-# Finding this package exposes FLANG_SOURCE_DIR, FLANG_BINARY_DIR, and FLANG_INCLUDE_DIRS to Flang-rt
-find_package(Flang REQUIRED HINTS "${FLANG_DIR}")
-# If the user specifies a relative path to LLVM_DIR, the calls to include
-# LLVM modules fail. Append the absolute path to LLVM_DIR instead.
-get_filename_component(FLANG_DIR_ABSOLUTE ${FLANG_DIR} REALPATH)
-list(APPEND CMAKE_MODULE_PATH ${FLANG_DIR_ABSOLUTE})
-
-set(LLVM_COMMON_CMAKE_UTILS "${FLANG_RT_SOURCE_DIR}/../cmake")
-set(LLVM_CMAKE_UTILS "${LLVM_BUILD_MAIN_SOURCE_DIR}/cmake")
-set(CLANG_CMAKE_UTILS "${FLANG_RT_SOURCE_DIR}/../clang/cmake")
-
-if (FLANG_RT_INCLUDE_TESTS)
-  # LLVM_DIR must be passed on invocation of flang-rt when tests are enabled.
-  if(NOT DEFINED LLVM_DIR OR LLVM_DIR STREQUAL "")
-    message(FATAL_ERROR "LLVM_DIR must be set to the directory of the LLVMConfig cmake file in order to find the LLVM package.")
-  endif()
-  # We need a pre-built/installed version of LLVM for gtest.
-  find_package(LLVM REQUIRED HINTS "${LLVM_DIR}")
-  # If the user specifies a relative path to LLVM_DIR, the calls to include
-  # LLVM modules fail. Append the absolute path to LLVM_DIR instead.
-  get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} REALPATH)
-  list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE})
-
-  add_compile_definitions(FLANG_RT_INCLUDE_TESTS=1)
-
-  if (DEFINED FLANG_BINARY_DIR)
-    set(FLANG_BINARY_DIR ${FLANG_BINARY_DIR} CACHE PATH "Path to the Flang build directory" FORCE)
-  else()
-    message(FATAL_ERROR "FLANG_BINARY_DIR must be defined or passed by the user when building tests.")
-  endif()
-  set(FLANG_RT_TEST_COMPILER ${FLANG_BINARY_DIR}/bin/flang-new
-      CACHE PATH "Compiler to use for testing")
-  set(FLANG_RT_GTEST_AVAIL 1)
-endif()
-
-# Add path for custom modules
-list(INSERT CMAKE_MODULE_PATH 0
-  "${FLANG_SOURCE_DIR}/cmake"
-  "${FLANG_SOURCE_DIR}/cmake/modules"
-  "${CLANG_CMAKE_UTILS}/modules"
-  "${LLVM_CMAKE_UTILS}/modules"
-)
-
-include(AddClang)
-include(AddFlang)
-include(TestBigEndian)
-test_big_endian(IS_BIGENDIAN)
-if (IS_BIGENDIAN)
-  add_compile_definitions(FLANG_BIG_ENDIAN=1)
-else ()
-  add_compile_definitions(FLANG_LITTLE_ENDIAN=1)
-endif ()
-
-# Flang's include directories are needed for flang/Common.
-include_directories(SYSTEM ${FLANG_INCLUDE_DIRS})
-
-# LLVM's include directories are needed for gtest.
-include_directories(SYSTEM ${LLVM_INCLUDE_DIRS})
-
-#===============================================================================
-# Add Subdirectories
-#===============================================================================
-set(FORTRAN_DECIMAL_SRC "${FLANG_SOURCE_DIR}/lib/Decimal")
-set(FORTRAN_RUNTIME_SRC "${FLANG_SOURCE_DIR}/runtime")
-set(FORTRAN_MAIN_SRC "${FLANG_SOURCE_DIR}/runtime/FortranMain")
-
-add_subdirectory(${FORTRAN_DECIMAL_SRC} FortranDecimalRT)
-add_subdirectory(${FORTRAN_RUNTIME_SRC} FortranRuntime)
-add_subdirectory(${FORTRAN_MAIN_SRC} FortranMain)
-
-if (FLANG_RT_INCLUDE_TESTS)
-  add_subdirectory(test)
-  if (FLANG_RT_GTEST_AVAIL)
-    add_subdirectory(unittests)
-  endif()
-endif()
-
-#===============================================================================
-# Create Flang-rt wrapper library
-#===============================================================================
-# Build as shared by default if no linkage type option set.
-if (NOT FLANG_RT_ENABLE_SHARED AND NOT FLANG_RT_ENABLE_STATIC)
-  add_library(flang-rt SHARED $<TARGET_OBJECTS:obj.FortranDecimalRT>
-                              $<TARGET_OBJECTS:obj.FortranRuntime>)
-endif()
-if (FLANG_RT_ENABLE_SHARED)
-  add_library(flang-rt SHARED $<TARGET_OBJECTS:obj.FortranDecimalRT>
-                              $<TARGET_OBJECTS:obj.FortranRuntime>)
-endif()
-if (FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED)
-  add_library(flang-rt STATIC $<TARGET_OBJECTS:obj.FortranDecimalRT>
-                              $<TARGET_OBJECTS:obj.FortranRuntime>)
-endif()
-# When building both static and shared, we need to append _static to the name
-# to avoid naming conflicts.
-if (FLANG_RT_ENABLE_STATIC AND FLANG_RT_ENABLE_SHARED)
-  add_library(flang-rt_static STATIC $<TARGET_OBJECTS:obj.FortranDecimalRT>
-                                     $<TARGET_OBJECTS:obj.FortranRuntime>)
-endif()
diff --git a/flang-rt/docs/GettingStarted.md b/flang-rt/docs/GettingStarted.md
deleted file mode 100644
index ffc721aa9b4a4c..00000000000000
--- a/flang-rt/docs/GettingStarted.md
+++ /dev/null
@@ -1,156 +0,0 @@
-<!--===- docs/GettingStarted.md 
-  
-   Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-   See https://llvm.org/LICENSE.txt for license information.
-   SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-  
--->
-
-# Flang-rt Runtime Library
-
-```eval_rst
-.. contents::
-   :local:
-```
-## What is Flang-rt
-Flang-rt is the runtime library project for Flang. The Flang driver requires
-the Fortran_main and Flang-rt libraries at runtime in order to generate
-user executables. Building this Flang-rt project will build both Fortran_main
-and the Flang-rt library, which is comprised of the FortranRuntime and
-FortranDecimalRT libraries.
-
-### Fortran_main
-Fortran_main is left out of the Flang-rt library because it is required to
-always be static unlike the link type of the Flang-rt library which can be
-configured. Fortran_main implements the main entry point into Fortran's
-`PROGRAM` in Flang by being the bridge between object files generated by Flang 
-and the C runtime that takes care of program set-up at system-level. For
-every Fortran `PROGRAM`, Flang generates the `_QQmain` function.
-Fortran_main implements the C `main` function that simply calls
-`_QQmain`.
-
-### FortranDecimalRT
-In order to decouple the common dependency between compiler and runtime,
-[FortranDecimal's sources](../../flang/lib/Decimal/CMakeLists.txt) are built
-separately for the compiler and the runtime. When the library is built for
-Flang-rt, the name FortranDecimalRT is used to avoid naming conflicts and
-confusion.
-
-### FortranRuntime
-This is the core runtime library in Flang-rt. The sources for this library
-currently still exist in the
-[Flang source directory](../../flang/runtime/CMakeLists.txt). We hope to
-migrate the sources to the Flang-rt directory in the future in order to further
-decouple the runtime from the Flang compiler.
-
-## Building Flang-rt
-Like other LLVM runtimes, Flang-rt can be built by targetting the
-[runtimes LLVM target](../../runtimes/CMakelists.txt). It can also be built
-when targetting the [llvm target](../../llvm/CMakeLists.txt) as an enabled
-runtime. Flang-rt will implicitly be added as an enabled runtime when Flang
-is an enabled project built by llvm. Flang-rt does not support standalone
-builds.
-
-In the future, we may be interested in supporting in optionally building
-Flang-rt when doing a Flang standalone build.
-
-### Building with the llvm target
-Assuming you are building Flang-rt to use with Flang, see
-[Flang's Getting Started guide](../../flang/docs/GettingStarted.md) for more
-information. To build Flang-rt when building the Flang compiler, once you have
-the llvm-project source ready, make a clean build directory. Let root be the
-root directory that you cloned llvm-project into.
-```bash
-cd root
-rm -rf build
-mkdir build
-cd build
-```
-Now invoke the cmake configuration command for llvm that would build Flang with
-Flang-rt.
-```bash
-cmake \
-  -G Ninja \
-  -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt" \
-  -DCMAKE_CXX_STANDARD=17 \
-  -DLLVM_INSTALL_UTILS=On \
-  # New Flang-rt flags for enabled link types
-  -DFLANG_RT_ENABLE_STATIC=On \
-  -DFLANG_RT_ENABLE_SHARED=On \
-  # We need to enable GTest if we want to run Flang-rt's testsuites
-  -DLLVM_INSTALL_GTEST=On \
-  -DFLANG_ENABLE_WERROR=On \
-  -DLLVM_LIT_ARGS=-v \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DLLVM_ENABLE_PROJECTS="clang;flang;lld;mlir;openmp" \
-  ../llvm-project/llvm
-```
-Flang requires other llvm projects (see LLVM_ENABLE_PROJECTS and the [Flang
-Getting Started Guide](../../flang/docs/GettingStarted.md) for more specifics
-on building Flang.
-
-By targetting the LLVM project, we are letting LLVM infrastructure handle
-invoking the runtimes target that will build Flang-rt. This includes finding
-the cmake packages for Clang, LLVM, Flang and MLIR that Flang-rt depends on.
-
-### Building with the runtimes target
-If you already have a pre-built/installed version of LLVM, Flang, Clang and
-MLIR, and would like to build Flang-rt without rebuilding the sources for these
-other projects. You can simply target the runtimes project directly and passing
-the paths to the directories of these files. If you built LLVM as we did above
-with default build directories, your runtimes invocation should look something
-like:
-```bash
-cd build
-BUILDDIR=`pwd`
-
-cmake \
-  -G Ninja \
-  -DCMAKE_CXX_STANDARD=17 \
-  # New Flang-rt flags for enabled link types
-  -DFLANG_RT_ENABLE_SHARED=On \
-  -DFLANG_RT_ENABLE_STATIC=On \
-  -DLLVM_LIT_ARGS=-v \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$LD_LIBRARY_PATH" \
-  -DLLVM_TARGETS_TO_BUILD=host \
-  -DLLVM_EXTERNAL_LIT=$BUILD_DIR/bin/llvm-lit \
-  # We need to specify the paths to the cmake packages of the dependencies
-  -DLLVM_DIR=$BUILD_DIR/lib/cmake/llvm \
-  -DMLIR_DIR=$BUILD_DIR/lib/cmake/mlir \
-  -DFLANG_DIR=$BUILD_DIR/lib/cmake/flang \
-  -DLLVM_ENABLE_RUNTIMES="flang-rt" \
-  ../llvm-project/runtimes/
-```
-
-## Library locations
-When building the llvm target with flang as an enabled project, the Flang-rt
-library will be built to `$BUILDDIR/runtimes/runtimes-bins/flang-rt/lib`. When
-building the runtimes target with flang-rt as an enabled runtime, the libraries
-will be built to `$BUILDDIR/flang-rt/lib` by default. In either configuration,
-the Fortran_main library will be built to `$BUILDDIR/lib` by default.
-
-## Using Flang-rt
-The two build paths mentioned above get implicitly added as library paths at the
-invocation of the driver. If Flang-rt is a shared library, you must make the
-dynamic linker aware of where to look. One method to do so is to set the
-environment variable `LD_LIBRARY_PATH` include the path to Flang-rt's directory.
-
-## Options
-Flang-rt introduces 2 CMake options used to configure the library's link type:
-```
-option(FLANG_RT_ENABLE_SHARED "Build flang-rt as a shared library." OFF)
-option(FLANG_RT_ENABLE_STATIC "Build flang-rt as a static library." OFF)
-```
-Both can be specified if you want to build both shared and static versions of
-the Flang-rt runtime. If both are specified, the static library will be named
-Flang-rt_static.a to avoid naming conflicts, as per the LLVM standard.
-
-## Usage Examples
-```bash
-# Example of using Flang with the shared Flang-rt runtime
-# First we need to explicitly tell the dynamic linker where to find Flang-rt
-# since it was built as shared.
-$ $ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$BUILDDIR/runtimes/runtimes-bins/flang-rt/lib"
-$ flang-new -ffree-form hello.f95 -o hello
-```
diff --git a/flang-rt/test/CMakeLists.txt b/flang-rt/test/CMakeLists.txt
deleted file mode 100644
index 51cd6ddf864bc8..00000000000000
--- a/flang-rt/test/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-# Test runner infrastructure for Flang-rt. This configures the Flang-rt test
-# trees for use by Lit, and delegates to LLVM's lit test handlers.
-
-llvm_canonicalize_cmake_booleans(
-  FLANG_STANDALONE_BUILD
-  LLVM_BUILD_EXAMPLES
-  LLVM_BYE_LINK_INTO_TOOLS
-  LLVM_ENABLE_PLUGINS
-)
-
-set(FLANG_TOOLS_DIR ${FLANG_BINARY_DIR}/bin)
-
-configure_lit_site_cfg(
-  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
-  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
-  MAIN_CONFIG
-  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
-  PATHS
-  ${PATHS_FOR_PLUGINS}
-)
-
-configure_lit_site_cfg(
-  ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
-  ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py
-  MAIN_CONFIG
-  ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py
-)
-
-configure_lit_site_cfg(
-  ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.site.cfg.py.in
-  ${CMAKE_CURRENT_BINARY_DIR}/NonGtestUnit/lit.site.cfg.py
-  MAIN_CONFIG
-  ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.cfg.py
-)
-
-set(FLANG_RT_TEST_PARAMS
-  flang_rt_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py)
-
-set(FLANG_RT_TEST_DEPENDS
-  LLVMSupport
-  flang-rt
-  Fortran_main
-)
-if (LLVM_ENABLE_PLUGINS AND NOT WIN32)
-  list(APPEND FLANG_RT_TEST_DEPENDS Bye)
-endif()
-
-if (FLANG_RT_INCLUDE_TESTS)
-  if (FLANG_RT_GTEST_AVAIL)
-    list(APPEND FLANG_RT_TEST_DEPENDS FlangRTUnitTests)
-  endif()
-endif()
-
-add_custom_target(flang-rt-test-depends DEPENDS ${FLANG_RT_TEST_DEPENDS})
-
-add_lit_testsuite(check-flang-rt "Running the Flang-rt regression tests"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  PARAMS ${FLANG_RT_TEST_PARAMS}
-  DEPENDS ${FLANG_RT_TEST_DEPENDS}
-)
-set_target_properties(check-flang-rt PROPERTIES FOLDER "Tests")
-
-add_lit_testsuites(FLANG_RT ${CMAKE_CURRENT_SOURCE_DIR}
-  PARAMS ${FLANG_RT_TEST_PARAMS}
-  DEPENDS ${FLANG_RT_TEST_DEPENDS})
-
-# To modify the default target triple for flang-rt tests.
-if (DEFINED FLANG_RT_TEST_TARGET_TRIPLE)
-  if (NOT DEFINED LLVM_TARGET_TRIPLE_ENV OR LLVM_TARGET_TRIPLE_ENV STREQUAL "")
-    message(FATAL_ERROR "LLVM_TARGET_TRIPLE_ENV must also be defined in order "
-                        "to use FLANG_RT_TEST_TARGET_TRIPLE.")
-  endif()
-endif()
diff --git a/flang-rt/test/FortranRuntime/no-cpp-dep.c b/flang-rt/test/FortranRuntime/no-cpp-dep.c
deleted file mode 100644
index 19c0260020b528..00000000000000
--- a/flang-rt/test/FortranRuntime/no-cpp-dep.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
-This test makes sure that flang's runtime does not depend on the C++ runtime
-library. It tries to link this simple file against libFortranRuntime.a with
-a C compiler.
-
-REQUIRES: c-compiler
-
-RUN: %cc -std=c99 %s -I%include %libruntime -lm -o /dev/null
-*/
-
-#include "flang/Runtime/entry-names.h"
-#include <stdint.h>
-
-/*
-Manually add declarations for the runtime functions that we want to make sure
-we're testing. We can't include any headers directly since they likely contain
-C++ code that would explode here.
-*/
-struct EnvironmentDefaultList;
-struct Descriptor;
-
-double RTNAME(CpuTime)();
-
-void RTNAME(ProgramStart)(
-    int, const char *[], const char *[], const struct EnvironmentDefaultList *);
-int32_t RTNAME(ArgumentCount)();
-int32_t RTNAME(GetCommandArgument)(int32_t, const struct Descriptor *,
-    const struct Descriptor *, const struct Descriptor *);
-int32_t RTNAME(GetEnvVariable)();
-
-int main() {
-  double x = RTNAME(CpuTime)();
-  RTNAME(ProgramStart)(0, 0, 0, 0);
-  int32_t c = RTNAME(ArgumentCount)();
-  int32_t v = RTNAME(GetCommandArgument)(0, 0, 0, 0);
-  int32_t e = RTNAME(GetEnvVariable)("FOO", 0, 0);
-  return x + c + v + e;
-}
diff --git a/flang-rt/test/NonGtestUnit/lit.cfg.py b/flang-rt/test/NonGtestUnit/lit.cfg.py
deleted file mode 100644
index 32be1d27c8f28c..00000000000000
--- a/flang-rt/test/NonGtestUnit/lit.cfg.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import os
-
-import lit.Test
-
-config.name = "flang-rt-OldUnit"
-
-config.suffixes = [".test"]
-
-config.test_source_root = os.path.join(config.flang_rt_obj_root, "unittests")
-config.test_exec_root = config.test_source_root
-
-config.test_format = lit.formats.ExecutableTest()
-
-path = os.path.pathsep.join(
-    (
-        config.flang_rt_lib_dir,
-        config.flang_bin_dir,
-        config.flang_libs_dir,
-        config.environment.get("LD_LIBRARY_PATH", ""),
-    )
-)
-config.environment["LD_LIBRARY_PATH"] = path
diff --git a/flang-rt/test/NonGtestUnit/lit.site.cfg.py.in b/flang-rt/test/NonGtestUnit/lit.site.cfg.py.in
deleted file mode 100644
index 83d8e0e04adf91..00000000000000
--- a/flang-rt/test/NonGtestUnit/lit.site.cfg.py.in
+++ /dev/null
@@ -1,23 +0,0 @@
-@LIT_SITE_CFG_IN_HEADER@
-
-config.llvm_src_root = "@LLVM_SOURCE_DIR@"
-config.llvm_obj_root = "@LLVM_BINARY_DIR@"
-config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
-config.llvm_libs_dir = lit_config.substitute("@LLVM_LIBS_DIR@")
-config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
-config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.flang_src_dir = "@FLANG_SOURCE_DIR@"
-config.flang_bin_dir = "@FLANG_BINARY_DIR@/bin"
-config.flang_libs_dir = "@FLANG_BINARY_DIR@/lib"
-config.flang_rt_obj_root = "@FLANG_RT_BINARY_DIR@"
-config.flang_tools_dir = lit_config.substitute("@FLANG_TOOLS_DIR@")
-config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
-config.flang_rt_src_dir = "@FLANG_RT_SOURCE_DIR@"
-config.flang_rt_lib_dir = "@FLANG_RT_BINARY_DIR@/lib"
-config.flang_rt_test_compiler = "@FLANG_RT_TEST_COMPILER@"
-config.flang_rt_test_triple = "@FLANG_RT_TEST_TARGET_TRIPLE@"
-config.target_triple = "@LLVM_TARGET_TRIPLE@"
-config.python_executable = "@Python3_EXECUTABLE@"
-
-# Let the main config do the real work.
-lit_config.load_config(config, "@FLANG_RT_SOURCE_DIR@/test/NonGtestUnit/lit.cfg.py")
diff --git a/flang-rt/test/Unit/lit.cfg.py b/flang-rt/test/Unit/lit.cfg.py
deleted file mode 100644
index 079516fc86d7ae..00000000000000
--- a/flang-rt/test/Unit/lit.cfg.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- Python -*-
-
-# Configuration file for the 'lit' test runner.
-
-import os
-import platform
-import re
-import subprocess
-import sys
-
-import lit.formats
-import lit.util
-
-from lit.llvm import llvm_config
-from lit.llvm.subst import ToolSubst
-from lit.llvm.subst import FindTool
-
-# name: The name of this test suite.
-config.name = "flang-rt-Unit"
-
-# suffixes: A list of file extensions to treat as test files.
-config.suffixes = []
-
-# test_source_root: The root path where unit test binaries are located.
-# test_exec_root: The root path where tests should be run.
-config.test_source_root = os.path.join(config.flang_rt_obj_root, "unittests")
-config.test_exec_root = config.test_source_root
-
-# testFormat: The test format to use to interpret tests.
-config.test_format = lit.formats.GoogleTest(config.llvm_build_mode, "Tests")
-
-# Tweak the PATH to include the flang bin and libs dirs.
-path = os.path.pathsep.join(
-    (
-        config.flang_bin_dir,
-        config.llvm_tools_dir,
-        config.environment["PATH"]
-    )
-)
-config.environment["PATH"] = path
-
-path = os.path.pathsep.join(
-    (
-        config.flang_rt_lib_dir,
-        config.flang_libs_dir,
-        config.flang_bin_dir,
-        config.environment.get("LD_LIBRARY_PATH", ""),
-    )
-)
-config.environment["LD_LIBRARY_PATH"] = path
-
-# Propagate PYTHON_EXECUTABLE into the environment
-# config.environment['PYTHON_EXECUTABLE'] = sys.executable
-
-# To modify the default target triple for flang-rt tests.
-if config.flang_rt_test_triple:
-    config.target_triple = config.flang_rt_test_triple
-    config.environment[config.llvm_target_triple_env] = config.flang_rt_test_triple
diff --git a/flang-rt/test/Unit/lit.site.cfg.py.in b/flang-rt/test/Unit/lit.site.cfg.py.in
deleted file mode 100644
index 4bd4bc40473d87..00000000000000
--- a/flang-rt/test/Unit/lit.site.cfg.py.in
+++ /dev/null
@@ -1,22 +0,0 @@
-@LIT_SITE_CFG_IN_HEADER@
-
-config.llvm_src_root = "@LLVM_SOURCE_DIR@"
-config.llvm_obj_root = "@LLVM_BINARY_DIR@"
-config.llvm_target_triple_env = "@LLVM_TARGET_TRIPLE_ENV@"
-config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
-config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
-config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.flang_bin_dir = "@FLANG_BINARY_DIR@/bin"
-config.flang_src_dir = "@FLANG_SOURCE_DIR@"
-config.flang_libs_dir = "@FLANG_BINARY_DIR@/lib"
-config.flang_rt_obj_root = "@FLANG_RT_BINARY_DIR@"
-config.flang_rt_src_dir = "@FLANG_RT_SOURCE_DIR@"
-config.flang_rt_lib_dir = "@FLANG_RT_BINARY_DIR@/lib"
-config.flang_rt_test_compiler = "@FLANG_RT_TEST_COMPILER@"
-config.flang_rt_test_triple = "@FLANG_RT_TEST_TARGET_TRIPLE@"
-config.flang_tools_dir = lit_config.substitute("@FLANG_TOOLS_DIR@")
-config.target_triple = "@LLVM_TARGET_TRIPLE@"
-config.python_executable = "@Python3_EXECUTABLE@"
-
-# Let the main config do the real work.
-lit_config.load_config(config, "@FLANG_RT_SOURCE_DIR@/test/Unit/lit.cfg.py")
diff --git a/flang-rt/test/lit.cfg.py b/flang-rt/test/lit.cfg.py
deleted file mode 100644
index 97edd482c57c14..00000000000000
--- a/flang-rt/test/lit.cfg.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# -*- Python -*-
-
-import os
-import platform
-import re
-import subprocess
-import sys
-
-import lit.formats
-import lit.util
-
-from lit.llvm import llvm_config
-from lit.llvm.subst import ToolSubst
-from lit.llvm.subst import FindTool
-
-# Configuration file for the 'lit' test runner.
-
-# name: The name of this test suite.
-config.name = "flang-rt"
-
-# testFormat: The test format to use to interpret tests.
-#
-# For now we require '&&' between commands, until they get globally killed and
-# the test runner updated.
-config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
-
-# suffixes: A list of file extensions to treat as test files.
-config.suffixes = [
-    ".c",
-    ".cpp",
-    ".f",
-    ".F",
-    ".ff",
-    ".FOR",
-    ".for",
-    ".f77",
-    ".f90",
-    ".F90",
-    ".ff90",
-    ".f95",
-    ".F95",
-    ".ff95",
-    ".fpp",
-    ".FPP",
-    ".cuf",
-    ".CUF",
-    ".f18",
-    ".F18",
-    ".f03",
-    ".F03",
-    ".f08",
-    ".F08",
-    ".ll",
-    ".fir",
-    ".mlir",
-]
-
-config.substitutions.append(("%PATH%", config.environment["PATH"]))
-config.substitutions.append(("%pluginext", config.llvm_plugin_ext))
-
-llvm_config.use_default_substitutions()
-
-# ask llvm-config about asserts
-llvm_config.feature_config([("--assertion-mode", {"ON": "asserts"})])
-
-# Targets
-config.targets = frozenset(config.targets_to_build.split())
-for arch in config.targets_to_build.split():
-    config.available_features.add(arch.lower() + "-registered-target")
-
-# To modify the default target triple for flang-rt tests.
-if config.flang_rt_test_triple:
-    config.target_triple = config.flang_rt_test_triple
-    config.environment[config.llvm_target_triple_env] = config.flang_rt_test_triple
-
-# excludes: A list of directories to exclude from the testsuite. The 'Inputs'
-# subdirectories contain auxiliary inputs for various tests in their parent
-# directories.
-config.excludes = ["Inputs", "CMakeLists.txt", "README.txt", "LICENSE.txt"]
-
-# Plugins (loadable modules)
-if config.has_plugins:
-    config.available_features.add("plugins")
-
-if config.linked_bye_extension:
-    config.substitutions.append(("%loadbye", ""))
-else:
-    config.substitutions.append(
-        (
-            "%loadbye",
-            "-fpass-plugin={}/Bye{}".format(
-                config.llvm_shlib_dir, config.llvm_plugin_ext
-            ),
-        )
-    )
-
-# test_source_root: The root path where tests are located.
-config.test_source_root = os.path.dirname(__file__)
-
-# test_exec_root: The root path where tests should be run.
-config.test_exec_root = os.path.join(config.flang_rt_obj_root, "test")
-
-llvm_config.with_environment("PATH", config.flang_bin_dir, append_path=True)
-llvm_config.with_environment("PATH", config.flang_rt_obj_root, append_path=True)
-llvm_config.with_environment("PATH", config.flang_libs_dir, append_path=True)
-llvm_config.with_environment("PATH", config.flang_rt_lib_dir, append_path=True)
-
-# For each occurrence of a flang tool name, replace it with the full path to
-# the build directory holding that tool.
-tools = [
-    ToolSubst("%flang", command=FindTool("flang-new"), unresolved="fatal"),
-    ToolSubst(
-        "%flang_fc1",
-        command=FindTool("flang-new"),
-        extra_args=["-fc1"],
-        unresolved="fatal",
-    ),
-]
-
-# Flang has several unimplemented features. TODO messages are used to mark
-# and fail if these features are exercised. Some TODOs exit with a non-zero
-# exit code, but others abort the execution in assert builds.
-# To catch aborts, the `--crash` option for the `not` command has to be used.
-tools.append(ToolSubst("%not_todo_cmd", command=FindTool("not"), unresolved="fatal"))
-if "asserts" in config.available_features:
-    tools.append(
-        ToolSubst(
-            "%not_todo_abort_cmd",
-            command=FindTool("not"),
-            extra_args=["--crash"],
-            unresolved="fatal",
-        )
-    )
-else:
-    tools.append(
-        ToolSubst("%not_todo_abort_cmd", command=FindTool("not"), unresolved="fatal")
-    )
-
-# Define some variables to help us test that the flang runtime doesn't depend on
-# the C++ runtime libraries. For this we need a C compiler. If for some reason
-# we don't have one, we can just disable the test.
-if config.cc:
-    libruntime_static = os.path.join(config.flang_rt_lib_dir, "libflang-rt.a")
-    libruntime_shared = os.path.join(config.flang_rt_lib_dir, "libflang-rt.so")
-    include = os.path.join(config.flang_src_dir, "include")
-
-    if (
-        os.path.isfile(libruntime_static)
-        and os.path.isdir(include)
-    ):
-        config.available_features.add("c-compiler")
-        tools.append(ToolSubst("%cc", command=config.cc, unresolved="fatal"))
-        tools.append(ToolSubst("%libruntime", command=libruntime_static, unresolved="fatal"))
-        tools.append(ToolSubst("%include", command=include, unresolved="fatal"))
-
-    elif (
-        os.path.isfile(libruntime_shared)
-        and os.path.isdir(include)
-    ):
-        config.available_features.add("c-compiler")
-        tools.append(ToolSubst("%cc", command=config.cc, unresolved="fatal"))
-        tools.append(ToolSubst("%libruntime", command=libruntime_shared, unresolved="fatal"))
-        tools.append(ToolSubst("%include", command=include, unresolved="fatal"))
-
-
-# Add all the tools and their substitutions (if applicable). Use the search paths provided for
-# finding the tools.
-llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)
-
-# Enable libpgmath testing
-result = lit_config.params.get("LIBPGMATH")
-if result:
-    config.environment["LIBPGMATH"] = True
diff --git a/flang-rt/test/lit.site.cfg.py.in b/flang-rt/test/lit.site.cfg.py.in
deleted file mode 100644
index 8a9873b1eabaf3..00000000000000
--- a/flang-rt/test/lit.site.cfg.py.in
+++ /dev/null
@@ -1,31 +0,0 @@
-@LIT_SITE_CFG_IN_HEADER@
-
-import sys
-
-config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
-config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@"))
-config.llvm_plugin_ext = "@LLVM_PLUGIN_EXT@"
-config.target_triple = "@LLVM_TARGET_TRIPLE@"
-config.llvm_target_triple_env = "@LLVM_TARGET_TRIPLE_ENV@"
-config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.errc_messages = "@LLVM_LIT_ERRC_MESSAGES@"
-config.flang_src_dir = "@FLANG_SOURCE_DIR@"
-config.flang_bin_dir = "@FLANG_BINARY_DIR@/bin"
-config.flang_libs_dir = "@FLANG_BINARY_DIR@/lib"
-config.flang_tools_dir = lit_config.substitute("@FLANG_TOOLS_DIR@")
-config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
-config.flang_rt_obj_root = "@FLANG_RT_BINARY_DIR@"
-config.flang_rt_src_dir = "@FLANG_RT_SOURCE_DIR@"
-config.flang_rt_lib_dir = "@FLANG_RT_BINARY_DIR@/lib"
-config.flang_rt_test_triple = "@FLANG_RT_TEST_TARGET_TRIPLE@"
-config.python_executable = "@PYTHON_EXECUTABLE@"
-config.has_plugins = @LLVM_ENABLE_PLUGINS@
-config.linked_bye_extension = @LLVM_BYE_LINK_INTO_TOOLS@
-config.cc = "@CMAKE_C_COMPILER@"
-config.targets_to_build = "@TARGETS_TO_BUILD@"
-
-import lit.llvm
-lit.llvm.initialize(lit_config, config)
-
-# Let the main config do the real work.
-lit_config.load_config(config, "@FLANG_RT_SOURCE_DIR@/test/lit.cfg.py")
diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt
deleted file mode 100644
index 2b813c6e16dfaa..00000000000000
--- a/flang-rt/unittests/CMakeLists.txt
+++ /dev/null
@@ -1,81 +0,0 @@
-if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
-  # If Fortran runtime is built as CUDA library, the linking
-  # of targets that link FortranRuntime must be done
-  # with CUDA_RESOLVE_DEVICE_SYMBOLS.
-  # CUDA language must be enabled for CUDA_RESOLVE_DEVICE_SYMBOLS
-  # to take effect.
-  enable_language(CUDA)
-endif()
-
-add_custom_target(FlangRTUnitTests)
-set_target_properties(FlangRTUnitTests PROPERTIES FOLDER "Flang-rt Unit Tests")
-
-function(add_flang_rt_unittest_offload_properties target)
-  # Set CUDA_RESOLVE_DEVICE_SYMBOLS.
-  if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
-    set_target_properties(${target}
-      PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
-      )
-  endif()
-  # Enable OpenMP offload during linking. We may need to replace
-  # LINK_OPTIONS with COMPILE_OPTIONS when there are OpenMP offload
-  # unittests.
-  #
-  # FIXME: replace 'native' in --offload-arch option with the list
-  #        of targets that Fortran Runtime was built for.
-  #        Common code must be moved from flang/runtime/CMakeLists.txt.
-  # TODO: Revisit this because of Flang-rt. runtime is no longer an added subdirectory of flang.
-  if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
-    set_target_properties(${target}
-        PROPERTIES LINK_OPTIONS
-        "-fopenmp;--offload-arch=native"
-    )
-  endif()
-endfunction()
-
-if(NOT TARGET llvm_gtest)
-  message(FATAL_ERROR "Target llvm_gtest not found.")
-endif()
-
-function(add_flang_rt_unittest test_dirname)
-  add_unittest(FlangRTUnitTests ${test_dirname} ${ARGN})
-  add_flang_rt_unittest_offload_properties(${test_dirname})
-endfunction()
-
-if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG)
-  add_compile_options("-Wno-suggest-override")
-endif()
-
-function(add_flang_rt_nongtest_unittest test_name)
-  cmake_parse_arguments(ARG
-    "SLOW_TEST"
-    ""
-    ""
-    ${ARGN})
-
-  list(APPEND LLVM_COMPILE_FLAGS "-L${LLVM_BINARY_DIR}/lib")
-  if(ARG_SLOW_TEST)
-      set(suffix .slow)
-  else()
-      set(suffix .test)
-  endif()
-
-  add_executable(${test_name}${suffix} ${test_name}.cpp)
-
-  if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB)
-    set(llvm_libs LLVM)
-  else()
-    llvm_map_components_to_libnames(llvm_libs Support)
-  endif()
-  target_link_libraries(${test_name}${suffix} ${llvm_libs} ${ARG_UNPARSED_ARGUMENTS})
-
-  if(NOT ARG_SLOW_TEST)
-    add_dependencies(FlangRTUnitTests ${test_name}${suffix})
-  endif()
-
-  add_flang_rt_unittest_offload_properties(${test_name}${suffix})
-endfunction()
-
-add_subdirectory(FortranRuntime)
-# TODO: We may want to find a better location for these tests that use the runtime
-add_subdirectory(FortranEvaluate)
diff --git a/flang-rt/unittests/FortranEvaluate/CMakeLists.txt b/flang-rt/unittests/FortranEvaluate/CMakeLists.txt
deleted file mode 100644
index 211018f5559da9..00000000000000
--- a/flang-rt/unittests/FortranEvaluate/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-add_library(FlangRTFortranEvaluateTesting
-  testing.cpp
-)
-if (LLVM_LINK_LLVM_DYLIB)
-  set(llvm_libs LLVM)
-else()
-  llvm_map_components_to_libnames(llvm_libs Support)
-endif()
-target_link_libraries(FlangRTFortranEvaluateTesting
-    ${llvm_libs})
-
-add_flang_rt_nongtest_unittest(reshape
-  FlangRTFortranEvaluateTesting
-  flang-rt
-)
-
-add_flang_rt_nongtest_unittest(ISO-Fortran-binding
-  FlangRTFortranEvaluateTesting
-  flang-rt
-)
diff --git a/flang-rt/unittests/FortranEvaluate/testing.cpp b/flang-rt/unittests/FortranEvaluate/testing.cpp
deleted file mode 100644
index cf0422f621c2fe..00000000000000
--- a/flang-rt/unittests/FortranEvaluate/testing.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include "testing.h"
-#include <cstdarg>
-#include <cstdio>
-#include <cstdlib>
-
-namespace testing {
-
-namespace {
-int passes{0};
-int failures{0};
-} // namespace
-
-static void BitBucket(const char *, ...) {}
-
-static void PrintFailureDetails(const char *format, ...) {
-  va_list ap;
-  va_start(ap, format);
-  fputs("\t", stderr);
-  vfprintf(stderr, format, ap);
-  va_end(ap);
-  fputc('\n', stderr);
-}
-
-FailureDetailPrinter Test(const char *file, int line, const char *predicate,
-                          bool pass) {
-  if (pass) {
-    ++passes;
-    return BitBucket;
-  } else {
-    ++failures;
-    fprintf(stderr, "%s:%d: FAIL: %s\n", file, line, predicate);
-    return PrintFailureDetails;
-  }
-}
-
-FailureDetailPrinter Match(const char *file, int line, std::uint64_t want,
-                           const char *gots, std::uint64_t got) {
-  if (want == got) {
-    ++passes;
-    return BitBucket;
-  } else {
-    ++failures;
-    fprintf(stderr, "%s:%d: FAIL: %s == 0x%jx, not 0x%jx\n", file, line, gots,
-            static_cast<std::uintmax_t>(got),
-            static_cast<std::uintmax_t>(want));
-    return PrintFailureDetails;
-  }
-}
-
-FailureDetailPrinter Match(const char *file, int line, const char *want,
-                           const char *gots, const std::string &got) {
-  if (want == got) {
-    ++passes;
-    return BitBucket;
-  } else {
-    ++failures;
-    fprintf(stderr, "%s:%d: FAIL: %s == \"%s\", not \"%s\"\n", file, line, gots,
-            got.data(), want);
-    return PrintFailureDetails;
-  }
-}
-
-FailureDetailPrinter Match(const char *file, int line, const std::string &want,
-                           const char *gots, const std::string &got) {
-  return Match(file, line, want.data(), gots, got);
-}
-
-FailureDetailPrinter Compare(const char *file, int line, const char *xs,
-                             const char *rel, const char *ys, std::uint64_t x,
-                             std::uint64_t y) {
-  while (*rel == ' ') {
-    ++rel;
-  }
-  bool pass{false};
-  if (*rel == '<') {
-    if (rel[1] == '=') {
-      pass = x <= y;
-    } else {
-      pass = x < y;
-    }
-  } else if (*rel == '>') {
-    if (rel[1] == '=') {
-      pass = x >= y;
-    } else {
-      pass = x > y;
-    }
-  } else if (*rel == '=') {
-    pass = x == y;
-  } else if (*rel == '!') {
-    pass = x != y;
-  }
-  if (pass) {
-    ++passes;
-    return BitBucket;
-  } else {
-    ++failures;
-    fprintf(stderr, "%s:%d: FAIL: %s[0x%jx] %s %s[0x%jx]\n", file, line, xs,
-            static_cast<std::uintmax_t>(x), rel, ys,
-            static_cast<std::uintmax_t>(y));
-    return PrintFailureDetails;
-  }
-}
-
-int Complete() {
-  if (failures == 0) {
-    if (passes == 1) {
-      fprintf(stdout, "single test PASSES\n");
-    } else {
-      fprintf(stdout, "all %d tests PASS\n", passes);
-    }
-    passes = 0;
-    return EXIT_SUCCESS;
-  } else {
-    if (passes == 1) {
-      fprintf(stderr, "1 test passes, ");
-    } else {
-      fprintf(stderr, "%d tests pass, ", passes);
-    }
-    if (failures == 1) {
-      fprintf(stderr, "1 test FAILS\n");
-    } else {
-      fprintf(stderr, "%d tests FAIL\n", failures);
-    }
-    passes = failures = 0;
-    return EXIT_FAILURE;
-  }
-}
-} // namespace testing
diff --git a/flang-rt/unittests/FortranEvaluate/testing.h b/flang-rt/unittests/FortranEvaluate/testing.h
deleted file mode 100644
index e4c8dc93f9be0c..00000000000000
--- a/flang-rt/unittests/FortranEvaluate/testing.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef FORTRAN_EVALUATE_TESTING_H_
-#define FORTRAN_EVALUATE_TESTING_H_
-
-#include <cinttypes>
-#include <string>
-
-namespace testing {
-
-// Returns EXIT_SUCCESS or EXIT_FAILURE, so a test's main() should end
-// with "return testing::Complete()".
-int Complete();
-
-// Pass/fail testing.  These macros return a pointer to a printf-like
-// function that can be optionally called to print more detail, e.g.
-//   COMPARE(x, ==, y)("z is 0x%llx", z);
-// will also print z after the usual failure message if x != y.
-#define TEST(predicate)                                                        \
-  testing::Test(__FILE__, __LINE__, #predicate, (predicate))
-#define MATCH(want, got) testing::Match(__FILE__, __LINE__, (want), #got, (got))
-#define COMPARE(x, rel, y)                                                     \
-  testing::Compare(__FILE__, __LINE__, #x, #rel, #y, (x), (y))
-
-// Functions called by these macros; do not call directly.
-using FailureDetailPrinter = void (*)(const char *, ...);
-FailureDetailPrinter Test(const char *file, int line, const char *predicate,
-                          bool pass);
-FailureDetailPrinter Match(const char *file, int line, std::uint64_t want,
-                           const char *gots, std::uint64_t got);
-FailureDetailPrinter Match(const char *file, int line, const char *want,
-                           const char *gots, const std::string &got);
-FailureDetailPrinter Match(const char *file, int line, const std::string &want,
-                           const char *gots, const std::string &got);
-FailureDetailPrinter Compare(const char *file, int line, const char *xs,
-                             const char *rel, const char *ys, std::uint64_t x,
-                             std::uint64_t y);
-} // namespace testing
-#endif // FORTRAN_EVALUATE_TESTING_H_
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 893d5a4750813d..ac30da89995ed3 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -188,7 +188,7 @@ if (FLANG_STANDALONE_BUILD)
   if (FLANG_GTEST_AVAIL)
     add_custom_target(check-all DEPENDS check-flang FlangUnitTests)
   else()
-    add_custom_target(check-all DEPENDS check-flang)
+    add_custom_target(check-all DEPENDS check-flang )
   endif()
   if (LLVM_BUILD_DOCS)
     add_custom_target(doxygen ALL)
@@ -421,11 +421,6 @@ if (FLANG_INCLUDE_TESTS)
   add_compile_definitions(FLANG_INCLUDE_TESTS=1)
 endif()
 
-# Add Flang subdirectories.
-# NOTE: The runtime subdirectory is no longer added here.
-# Sources for the runtime are added in Flang-rt.
-# TODO: Move the runtime sources to the flang-rt top level directory.
-
 add_subdirectory(include)
 add_subdirectory(lib)
 add_subdirectory(cmake/modules)
@@ -435,6 +430,7 @@ option(FLANG_BUILD_TOOLS
 if (FLANG_BUILD_TOOLS)
   add_subdirectory(tools)
 endif()
+add_subdirectory(runtime)
 
 if (LLVM_INCLUDE_EXAMPLES)
   add_subdirectory(examples)
diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake
index a072f372bcff85..41ce8738e7bf27 100644
--- a/flang/cmake/modules/AddFlang.cmake
+++ b/flang/cmake/modules/AddFlang.cmake
@@ -17,7 +17,7 @@ macro(add_flang_subdirectory name)
 endmacro()
 
 function(add_flang_library name)
-  set(options SHARED STATIC INSTALL_WITH_TOOLCHAIN PIC)
+  set(options SHARED STATIC INSTALL_WITH_TOOLCHAIN)
   set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS)
   cmake_parse_arguments(ARG
     "${options}"
@@ -65,9 +65,6 @@ function(add_flang_library name)
   endif()
 
   llvm_add_library(${name} ${LIBTYPE} ${ARG_UNPARSED_ARGUMENTS} ${srcs})
-  if (ARG_PIC)
-    set_target_properties(${name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-  endif()
 
   clang_target_link_libraries(${name} PRIVATE ${ARG_CLANG_LIBS})
 
diff --git a/flang/cmake/modules/FlangConfig.cmake.in b/flang/cmake/modules/FlangConfig.cmake.in
index 359a1dbac617e4..ac4b77bd8514a7 100644
--- a/flang/cmake/modules/FlangConfig.cmake.in
+++ b/flang/cmake/modules/FlangConfig.cmake.in
@@ -9,8 +9,6 @@ find_package(LLVM ${LLVM_VERSION} EXACT REQUIRED CONFIG
 set(FLANG_EXPORTED_TARGETS "@FLANG_EXPORTS@")
 set(FLANG_CMAKE_DIR "@FLANG_CONFIG_CMAKE_DIR@")
 set(FLANG_INCLUDE_DIRS "@FLANG_CONFIG_INCLUDE_DIRS@")
-set(FLANG_SOURCE_DIR "@FLANG_SOURCE_DIR@")
-set(FLANG_BINARY_DIR "@FLANG_BINARY_DIR@")
 
 # Provide all our library targets to users.
 @FLANG_CONFIG_INCLUDE_EXPORTS@
diff --git a/flang/docs/ComplexOperations.md b/flang/docs/ComplexOperations.md
index d035658b90acd1..b7d7a0381bc8ab 100644
--- a/flang/docs/ComplexOperations.md
+++ b/flang/docs/ComplexOperations.md
@@ -1,8 +1,9 @@
 # Complex Operations
 
-```{eval-rst}
-.. toctree::
-   :local:
+```{contents}
+---
+local:
+---
 ```
 
 Fortran includes support for complex number types and a set of operators and
@@ -66,7 +67,7 @@ libm functions.
 
 Similarly to the numerical lowering through the math dialect, certain MLIR
 optimisations could violate the precise floating point model, so when that is
-requested lowering manually emits calls to libm, rather than going through the 
+requested lowering manually emits calls to libm, rather than going through the
 MLIR complex dialect.
 
 The ComplexToStandard dialect does still call into libm for some floating
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index c1591eb49732b7..480039911719c6 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -101,6 +101,8 @@ end
 * `$` and `@` as legal characters in names
 * Initialization in type declaration statements using `/values/`
 * Saved variables without explicit or default initializers are zero initialized.
+* In a saved entity of a type with a default initializer, components without default
+  values are zero initialized.
 * Kind specification with `*`, e.g. `REAL*4`
 * `DOUBLE COMPLEX` as a synonym for `COMPLEX(KIND(0.D0))` --
   but not when spelled `TYPE(DOUBLECOMPLEX)`.
diff --git a/flang/docs/conf.py b/flang/docs/conf.py
index a90343ef8e05e6..9b452373737470 100644
--- a/flang/docs/conf.py
+++ b/flang/docs/conf.py
@@ -31,6 +31,7 @@
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
+myst_heading_anchors = 6
 
 import sphinx
 
diff --git a/flang/include/flang/Frontend/LangOptions.def b/flang/include/flang/Frontend/LangOptions.def
index b7b244a58253d6..3a1d44f7fb472d 100644
--- a/flang/include/flang/Frontend/LangOptions.def
+++ b/flang/include/flang/Frontend/LangOptions.def
@@ -53,5 +53,8 @@ LANGOPT(OpenMPNoThreadState, 1, 0)
 /// Assume that no thread in a parallel region will encounter a parallel region
 LANGOPT(OpenMPNoNestedParallelism, 1, 0)
 
+LANGOPT(VScaleMin, 32, 0)  ///< Minimum vscale range value
+LANGOPT(VScaleMax, 32, 0)  ///< Maximum vscale range value
+
 #undef LANGOPT
 #undef ENUM_LANGOPT
diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index 59e31462ab5aac..2893fd46c267d9 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -127,8 +127,8 @@ namespace cfi_internal {
 template <typename T> struct FlexibleArray : T {
   RT_API_ATTRS T &operator[](int index) { return *(this + index); }
   const RT_API_ATTRS T &operator[](int index) const { return *(this + index); }
-  operator T *() { return this; }
-  operator const T *() const { return this; }
+  RT_API_ATTRS operator T *() { return this; }
+  RT_API_ATTRS operator const T *() const { return this; }
 };
 } // namespace cfi_internal
 #endif
@@ -182,19 +182,20 @@ template <> struct CdescStorage<0> : public CFI_cdesc_t {};
 #ifdef __cplusplus
 extern "C" {
 #endif
-void *CFI_address(const CFI_cdesc_t *, const CFI_index_t subscripts[]);
-int CFI_allocate(CFI_cdesc_t *, const CFI_index_t lower_bounds[],
+RT_API_ATTRS void *CFI_address(
+    const CFI_cdesc_t *, const CFI_index_t subscripts[]);
+RT_API_ATTRS int CFI_allocate(CFI_cdesc_t *, const CFI_index_t lower_bounds[],
     const CFI_index_t upper_bounds[], size_t elem_len);
 RT_API_ATTRS int CFI_deallocate(CFI_cdesc_t *);
 int CFI_establish(CFI_cdesc_t *, void *base_addr, CFI_attribute_t, CFI_type_t,
     size_t elem_len, CFI_rank_t, const CFI_index_t extents[]);
-int CFI_is_contiguous(const CFI_cdesc_t *);
+RT_API_ATTRS int CFI_is_contiguous(const CFI_cdesc_t *);
 RT_API_ATTRS int CFI_section(CFI_cdesc_t *, const CFI_cdesc_t *source,
     const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[],
     const CFI_index_t strides[]);
-int CFI_select_part(CFI_cdesc_t *, const CFI_cdesc_t *source,
+RT_API_ATTRS int CFI_select_part(CFI_cdesc_t *, const CFI_cdesc_t *source,
     size_t displacement, size_t elem_len);
-int CFI_setpointer(
+RT_API_ATTRS int CFI_setpointer(
     CFI_cdesc_t *, const CFI_cdesc_t *source, const CFI_index_t lower_bounds[]);
 #ifdef __cplusplus
 } // extern "C"
diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index 477c8164a18eaf..c792e75f114649 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -212,12 +212,10 @@ class AbstractConverter {
 
   /// Register a runtime derived type information object symbol to ensure its
   /// object will be generated as a global.
-  virtual void registerRuntimeTypeInfo(mlir::Location loc,
-                                       SymbolRef typeInfoSym) = 0;
-
-  virtual void registerDispatchTableInfo(
-      mlir::Location loc,
-      const Fortran::semantics::DerivedTypeSpec *typeSpec) = 0;
+  virtual void
+  registerTypeInfo(mlir::Location loc, SymbolRef typeInfoSym,
+                   const Fortran::semantics::DerivedTypeSpec &typeSpec,
+                   fir::RecordType type) = 0;
 
   //===--------------------------------------------------------------------===//
   // Locations
diff --git a/flang/include/flang/Lower/HlfirIntrinsics.h b/flang/include/flang/Lower/HlfirIntrinsics.h
index df1f1ac9a7cf5a..088f8bccef4aa1 100644
--- a/flang/include/flang/Lower/HlfirIntrinsics.h
+++ b/flang/include/flang/Lower/HlfirIntrinsics.h
@@ -19,6 +19,8 @@
 #define FORTRAN_LOWER_HLFIRINTRINSICS_H
 
 #include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "llvm/ADT/SmallVector.h"
 #include <cassert>
 #include <optional>
@@ -46,18 +48,71 @@ struct PreparedActualArgument {
   PreparedActualArgument(hlfir::Entity actual,
                          std::optional<mlir::Value> isPresent)
       : actual{actual}, isPresent{isPresent} {}
+  PreparedActualArgument(hlfir::ElementalAddrOp vectorSubscriptedActual)
+      : actual{vectorSubscriptedActual}, isPresent{std::nullopt} {}
   void setElementalIndices(mlir::ValueRange &indices) {
     oneBasedElementalIndices = &indices;
   }
-  hlfir::Entity getActual(mlir::Location loc,
-                          fir::FirOpBuilder &builder) const {
-    if (oneBasedElementalIndices)
-      return hlfir::getElementAt(loc, builder, actual,
-                                 *oneBasedElementalIndices);
-    return actual;
+
+  /// Get the prepared actual. If this is an array argument in an elemental
+  /// call, the current element value will be returned.
+  hlfir::Entity getActual(mlir::Location loc, fir::FirOpBuilder &builder) const;
+
+  void derefPointersAndAllocatables(mlir::Location loc,
+                                    fir::FirOpBuilder &builder) {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
+      actual = hlfir::derefPointersAndAllocatables(loc, builder, *actualEntity);
+  }
+
+  void loadTrivialScalar(mlir::Location loc, fir::FirOpBuilder &builder) {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
+      actual = hlfir::loadTrivialScalar(loc, builder, *actualEntity);
+  }
+
+  /// Ensure an array expression argument is fully evaluated in memory before
+  /// the call. Useful for impure elemental calls.
+  hlfir::AssociateOp associateIfArrayExpr(mlir::Location loc,
+                                          fir::FirOpBuilder &builder) {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual)) {
+      if (!actualEntity->isVariable() && actualEntity->isArray()) {
+        mlir::Type storageType = actualEntity->getType();
+        hlfir::AssociateOp associate = hlfir::genAssociateExpr(
+            loc, builder, *actualEntity, storageType, "adapt.impure_arg_eval");
+        actual = hlfir::Entity{associate};
+        return associate;
+      }
+    }
+    return {};
+  }
+
+  bool isArray() const {
+    return std::holds_alternative<hlfir::ElementalAddrOp>(actual) ||
+           std::get<hlfir::Entity>(actual).isArray();
   }
-  hlfir::Entity getOriginalActual() const { return actual; }
-  void setOriginalActual(hlfir::Entity newActual) { actual = newActual; }
+
+  mlir::Value genShape(mlir::Location loc, fir::FirOpBuilder &builder) {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
+      return hlfir::genShape(loc, builder, *actualEntity);
+    return std::get<hlfir::ElementalAddrOp>(actual).getShape();
+  }
+
+  mlir::Value genCharLength(mlir::Location loc, fir::FirOpBuilder &builder) {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
+      return hlfir::genCharLength(loc, builder, *actualEntity);
+    auto typeParams = std::get<hlfir::ElementalAddrOp>(actual).getTypeparams();
+    assert(typeParams.size() == 1 &&
+           "failed to retrieve vector subscripted character length");
+    return typeParams[0];
+  }
+
+  /// When the argument is polymorphic, get mold value with the same dynamic
+  /// type.
+  mlir::Value getPolymorphicMold(mlir::Location loc) const {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
+      return *actualEntity;
+    TODO(loc, "polymorphic vector subscripts");
+  }
+
   bool handleDynamicOptional() const { return isPresent.has_value(); }
   mlir::Value getIsPresent() const {
     assert(handleDynamicOptional() && "not a dynamic optional");
@@ -67,7 +122,7 @@ struct PreparedActualArgument {
   void resetOptionalAspect() { isPresent = std::nullopt; }
 
 private:
-  hlfir::Entity actual;
+  std::variant<hlfir::Entity, hlfir::ElementalAddrOp> actual;
   mlir::ValueRange *oneBasedElementalIndices{nullptr};
   // When the actual may be dynamically optional, "isPresent"
   // holds a boolean value indicating the presence of the
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index bba8bc23c26aa5..0b36186d68a461 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -254,11 +254,6 @@ class FirOpBuilder : public mlir::OpBuilder, public mlir::OpBuilder::Listener {
                         bodyBuilder, linkage);
   }
 
-  /// Create a fir::DispatchTable operation.
-  fir::DispatchTableOp createDispatchTableOp(mlir::Location loc,
-                                             llvm::StringRef name,
-                                             llvm::StringRef parentName);
-
   /// Convert a StringRef string into a fir::StringLitOp.
   fir::StringLitOp createStringLitOp(mlir::Location loc,
                                      llvm::StringRef string);
diff --git a/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h b/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
index b94782bb9ea984..3aeb124f911867 100644
--- a/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
+++ b/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
@@ -188,6 +188,9 @@ class TBAABuilder {
   // Returns TBAATagAttr representing access tag:
   //   < <any data access>, <any data access>, 0 >
   mlir::LLVM::TBAATagAttr getAnyDataAccessTag();
+  // Returns TBAATagAttr representing access tag:
+  //   < <any access>, <any access>, 0 >
+  mlir::LLVM::TBAATagAttr getAnyAccessTag();
 
   // Returns TBAATagAttr representing access tag described by the base and
   // access FIR types and the LLVM::GepOp representing the access in terms of
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index a57add9f731979..0329a5325a21c9 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2778,6 +2778,10 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> {
           (*this)->getAttrOfType<mlir::StringAttr>(
               mlir::SymbolTable::getSymbolAttrName()).getValue());
     }
+
+    bool isInitialized() {
+      return getInitVal() || hasInitializationBody();
+    }
   }];
 }
 
@@ -2809,20 +2813,31 @@ def fir_GlobalLenOp : fir_Op<"global_len", []> {
 
 def ImplicitFirTerminator : SingleBlockImplicitTerminator<"FirEndOp">;
 
-def fir_DispatchTableOp : fir_Op<"dispatch_table",
+def fir_TypeInfoOp : fir_Op<"type_info",
     [IsolatedFromAbove, Symbol, ImplicitFirTerminator]> {
-  let summary = "Dispatch table definition";
+  let summary = "Derived type information";
 
   let description = [{
-    Define a dispatch table for a derived type with type-bound procedures.
+    Define extra information about a !fir.type<> that represents
+    a Fortran derived type.
 
-    A dispatch table is an untyped symbol that contains a list of associations
+    The optional dispatch table region defines a dispatch table with the derived
+    type type-bound procedures. It contains a list of associations
     between method identifiers and corresponding `FuncOp` symbols.
-
     The ordering of associations in the map is determined by the front end.
 
+    The "no_init" flag indicates that this type has no components requiring default
+    initialization (including setting allocatable component to a clean deallocated
+    state).
+
+    The "no_destroy" flag indicates that there are no allocatable components
+    that require deallocation.
+
+    The "no_final" flag indicates that there are no final methods for this type,
+    for its parents ,or for components.
+
     ```mlir
-      fir.dispatch_table @_QDTMquuzTfoo {
+      fir.type_info @_QMquuzTfoo noinit nofinal : !fir.type<_QMquuzTfoo{i:i32}> dispatch_table {
         fir.dt_entry method1, @_QFNMquuzTfooPmethod1AfooR
         fir.dt_entry method2, @_QFNMquuzTfooPmethod2AfooII
       }
@@ -2831,32 +2846,46 @@ def fir_DispatchTableOp : fir_Op<"dispatch_table",
 
   let arguments = (ins
     SymbolNameAttr:$sym_name,
-    OptionalAttr<StrAttr>:$parent
+    TypeAttr:$type,
+    OptionalAttr<TypeAttr>:$parent_type,
+    UnitAttr:$no_init,
+    UnitAttr:$no_destroy,
+    UnitAttr:$no_final
   );
 
-  let hasCustomAssemblyFormat = 1;
   let hasVerifier = 1;
 
-  let regions = (region AnyRegion:$region);
+  let regions = (region MaxSizedRegion<1>:$dispatch_table);
 
-  let skipDefaultBuilders = 1;
   let builders = [
-    OpBuilder<(ins "llvm::StringRef":$name, "mlir::Type":$type,
-      "llvm::StringRef":$parent,
+    OpBuilder<(ins "fir::RecordType":$type, "fir::RecordType":$parent_type,
       CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attrs)>
   ];
 
-  let extraClassDeclaration = [{
-    static constexpr llvm::StringRef getParentAttrNameStr() { return "parent"; }
-    static constexpr llvm::StringRef getExtendsKeyword() { return "extends"; }
+  let assemblyFormat = [{
+    $sym_name (`noinit` $no_init^)? (`nodestroy` $no_destroy^)?
+    (`nofinal` $no_final^)? (`extends` $parent_type^)? attr-dict `:` $type
+    (`dispatch_table` $dispatch_table^)?
+  }];
 
-    mlir::Block &getBlock() {
-      return getRegion().front();
+  let extraClassDeclaration = [{
+    fir::RecordType getRecordType() {
+      return mlir::cast<fir::RecordType>(getType());
+    }
+    fir::RecordType getIfParentType() {
+      if (auto parentType = getParentType())
+        return mlir::cast<fir::RecordType>(*parentType);
+      return {};
+    }
+    std::optional<llvm::StringRef> getIfParentName() {
+      if (auto parentType = getIfParentType())
+        return parentType.getName();
+      return std::nullopt;
     }
   }];
 }
 
-def fir_DTEntryOp : fir_Op<"dt_entry", [HasParent<"DispatchTableOp">]> {
+def fir_DTEntryOp : fir_Op<"dt_entry", [HasParent<"TypeInfoOp">]> {
   let summary = "map entry in a dispatch table";
 
   let description = [{
diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h
index 77807ea2a308c6..2abcc6547bbabf 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRType.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRType.h
@@ -333,6 +333,12 @@ bool isUnlimitedPolymorphicType(mlir::Type ty);
 /// Return true iff `ty` is the type of an assumed type.
 bool isAssumedType(mlir::Type ty);
 
+/// Return true iff `ty` is the type of an assumed shape array.
+bool isAssumedShape(mlir::Type ty);
+
+/// Return true iff `ty` is the type of an allocatable array.
+bool isAllocatableOrPointerArray(mlir::Type ty);
+
 /// Return true iff `boxTy` wraps a record type or an unlimited polymorphic
 /// entity. Polymorphic entities with intrinsic type spec do not have addendum
 inline bool boxHasAddendum(fir::BaseBoxType boxTy) {
@@ -349,6 +355,10 @@ mlir::Type unwrapInnerType(mlir::Type ty);
 /// Return true iff `ty` is a RecordType with members that are allocatable.
 bool isRecordWithAllocatableMember(mlir::Type ty);
 
+/// Return true iff `ty` is a scalar/array of RecordType
+/// with members that are descriptors.
+bool isRecordWithDescriptorMember(mlir::Type ty);
+
 /// Return true iff `ty` is a RecordType with type parameters.
 inline bool isRecordWithTypeParameters(mlir::Type ty) {
   if (auto recTy = ty.dyn_cast_or_null<fir::RecordType>())
diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index ebf506543ebf4a..d5b045924f3c0c 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -36,21 +36,22 @@ using BindingTables = llvm::DenseMap<llvm::StringRef, BindingTable>;
 inline void buildBindingTables(BindingTables &bindingTables,
                                mlir::ModuleOp mod) {
 
-  // The binding tables are defined in FIR from lowering as fir.dispatch_table
-  // operation. Go through each binding tables and store the procedure name and
+  // The binding tables are defined in FIR after lowering inside fir.type_info
+  // operations. Go through each binding tables and store the procedure name and
   // binding index for later use by the fir.dispatch conversion pattern.
-  for (auto dispatchTableOp : mod.getOps<fir::DispatchTableOp>()) {
+  for (auto typeInfo : mod.getOps<fir::TypeInfoOp>()) {
     unsigned bindingIdx = 0;
     BindingTable bindings;
-    if (dispatchTableOp.getRegion().empty()) {
-      bindingTables[dispatchTableOp.getSymName()] = bindings;
+    if (typeInfo.getDispatchTable().empty()) {
+      bindingTables[typeInfo.getSymName()] = bindings;
       continue;
     }
-    for (auto dtEntry : dispatchTableOp.getBlock().getOps<fir::DTEntryOp>()) {
+    for (auto dtEntry :
+         typeInfo.getDispatchTable().front().getOps<fir::DTEntryOp>()) {
       bindings[dtEntry.getMethod()] = bindingIdx;
       ++bindingIdx;
     }
-    bindingTables[dispatchTableOp.getSymName()] = bindings;
+    bindingTables[typeInfo.getSymName()] = bindings;
   }
 }
 
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 8aeb3e373298e8..64882c8ec406b0 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -80,6 +80,10 @@ std::unique_ptr<mlir::Pass> createOMPFunctionFilteringPass();
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 createOMPMarkDeclareTargetPass();
 
+std::unique_ptr<mlir::Pass> createVScaleAttrPass();
+std::unique_ptr<mlir::Pass>
+createVScaleAttrPass(std::pair<unsigned, unsigned> vscaleAttr);
+
 // declarative passes
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/Transforms/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 9474edf13ce463..80da485392007f 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -326,4 +326,18 @@ def OMPFunctionFiltering : Pass<"omp-function-filtering"> {
   ];
 }
 
+def VScaleAttr : Pass<"vscale-attr", "mlir::func::FuncOp"> {
+  let summary = "Add vscale_range attribute to functions";
+  let description = [{
+     Set an attribute for the vscale range on functions, to allow scalable
+     vector operations to be used on processors with variable vector length.
+  }];
+  let options = [
+    Option<"vscaleRange", "vscale-range",
+           "std::pair<unsigned, unsigned>", /*default=*/"std::pair<unsigned, unsigned>{}",
+           "vector scale range">,
+  ];
+  let constructor = "::fir::createVScaleAttrPass()";
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/include/flang/Runtime/api-attrs.h b/flang/include/flang/Runtime/api-attrs.h
index 7420472aed670e..61da2c06d3a4dc 100644
--- a/flang/include/flang/Runtime/api-attrs.h
+++ b/flang/include/flang/Runtime/api-attrs.h
@@ -45,7 +45,7 @@
 /*
  * RT_OFFLOAD_API_GROUP_BEGIN/END pair is placed around definitions
  * of functions that can be referenced in other modules of Flang
- * runtime. For OpenMP offload these functions are made "declare target"
+ * runtime. For OpenMP offload, these functions are made "declare target"
  * making sure they are compiled for the target even though direct
  * references to them from other "declare target" functions may not
  * be seen. Host-only functions should not be put in between these
@@ -54,6 +54,15 @@
 #define RT_OFFLOAD_API_GROUP_BEGIN RT_EXT_API_GROUP_BEGIN
 #define RT_OFFLOAD_API_GROUP_END RT_EXT_API_GROUP_END
 
+/*
+ * RT_OFFLOAD_VAR_GROUP_BEGIN/END pair is placed around definitions
+ * of variables (e.g. globals or static class members) that can be
+ * referenced in functions marked with RT_OFFLOAD_API_GROUP_BEGIN/END.
+ * For OpenMP offload, these variables are made "declare target".
+ */
+#define RT_OFFLOAD_VAR_GROUP_BEGIN RT_EXT_API_GROUP_BEGIN
+#define RT_OFFLOAD_VAR_GROUP_END RT_EXT_API_GROUP_END
+
 /*
  * RT_VAR_GROUP_BEGIN/END pair is placed around definitions
  * of module scope variables referenced by Flang runtime (directly
diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h
index 779997dab61868..b19c02f44c73bc 100644
--- a/flang/include/flang/Runtime/assign.h
+++ b/flang/include/flang/Runtime/assign.h
@@ -30,23 +30,23 @@ class Descriptor;
 
 extern "C" {
 // API for lowering assignment
-void RTNAME(Assign)(Descriptor &to, const Descriptor &from,
+void RTDECL(Assign)(Descriptor &to, const Descriptor &from,
     const char *sourceFile = nullptr, int sourceLine = 0);
 // This variant has no finalization, defined assignment, or allocatable
 // reallocation.
-void RTNAME(AssignTemporary)(Descriptor &to, const Descriptor &from,
+void RTDECL(AssignTemporary)(Descriptor &to, const Descriptor &from,
     const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(CopyOutAssign)(Descriptor &to, const Descriptor &from,
+void RTDECL(CopyOutAssign)(Descriptor &to, const Descriptor &from,
     bool skipToInit, const char *sourceFile = nullptr, int sourceLine = 0);
 // This variant is for assignments to explicit-length CHARACTER left-hand
 // sides that might need to handle truncation or blank-fill, and
 // must maintain the character length even if an allocatable array
 // is reallocated.
-void RTNAME(AssignExplicitLengthCharacter)(Descriptor &to,
+void RTDECL(AssignExplicitLengthCharacter)(Descriptor &to,
     const Descriptor &from, const char *sourceFile = nullptr,
     int sourceLine = 0);
 // This variant is assignments to whole polymorphic allocatables.
-void RTNAME(AssignPolymorphic)(Descriptor &to, const Descriptor &from,
+void RTDECL(AssignPolymorphic)(Descriptor &to, const Descriptor &from,
     const char *sourceFile = nullptr, int sourceLine = 0);
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index d26139321227fc..c9a3b1b0310077 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -427,11 +427,13 @@ static_assert(sizeof(Descriptor) == sizeof(ISO::CFI_cdesc_t));
 template <int MAX_RANK = maxRank, bool ADDENDUM = false, int MAX_LEN_PARMS = 0>
 class alignas(Descriptor) StaticDescriptor {
 public:
+  RT_OFFLOAD_VAR_GROUP_BEGIN
   static constexpr int maxRank{MAX_RANK};
   static constexpr int maxLengthTypeParameters{MAX_LEN_PARMS};
   static constexpr bool hasAddendum{ADDENDUM || MAX_LEN_PARMS > 0};
   static constexpr std::size_t byteSize{
       Descriptor::SizeInBytes(maxRank, hasAddendum, maxLengthTypeParameters)};
+  RT_OFFLOAD_VAR_GROUP_END
 
   RT_API_ATTRS Descriptor &descriptor() {
     return *reinterpret_cast<Descriptor *>(storage_);
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index ab48ef422801a4..0b5c3dde2e7208 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -180,6 +180,8 @@ const Symbol *HasImpureFinal(const Symbol &);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
 bool MayRequireFinalization(const DerivedTypeSpec &derived);
+// Does this type have an allocatable direct component?
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived);
 
 bool IsInBlankCommon(const Symbol &);
 inline bool IsAssumedSizeArray(const Symbol &symbol) {
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 616d9ddc066a75..76d18b73aee20a 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -16,7 +16,6 @@
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
 #include "flang/Optimizer/Transforms/Passes.h"
-#include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Passes/OptimizationLevel.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -184,29 +183,28 @@ inline void addExternalNameConversionPass(
 /// incremental conversion of FIR.
 ///
 /// \param pm - MLIR pass manager that will hold the pipeline definition
-inline void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
-    llvm::OptimizationLevel optLevel = defaultOptLevel,
-    bool stackArrays = false, bool loopVersioning = false) {
+inline void createDefaultFIROptimizerPassPipeline(
+    mlir::PassManager &pm, const MLIRToLLVMPassPipelineConfig &pc) {
   // simplify the IR
   mlir::GreedyRewriteConfig config;
   config.enableRegionSimplification = false;
   pm.addPass(mlir::createCSEPass());
-  fir::addAVC(pm, optLevel);
+  fir::addAVC(pm, pc.OptLevel);
   pm.addNestedPass<mlir::func::FuncOp>(fir::createCharacterConversionPass());
   pm.addPass(mlir::createCanonicalizerPass(config));
   pm.addPass(fir::createSimplifyRegionLitePass());
-  if (optLevel.isOptimizingForSpeed()) {
+  if (pc.OptLevel.isOptimizingForSpeed()) {
     // These passes may increase code size.
     pm.addPass(fir::createSimplifyIntrinsicsPass());
     pm.addPass(fir::createAlgebraicSimplificationPass(config));
   }
 
-  if (loopVersioning)
+  if (pc.LoopVersioning)
     pm.addPass(fir::createLoopVersioningPass());
 
   pm.addPass(mlir::createCSEPass());
 
-  if (stackArrays)
+  if (pc.StackArrays)
     pm.addPass(fir::createStackArraysPass());
   else
     fir::addMemoryAllocationOpt(pm);
@@ -291,19 +289,21 @@ inline void createDebugPasses(
   }
 }
 
-inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
-    llvm::OptimizationLevel optLevel = defaultOptLevel,
-    bool underscoring = true,
-    llvm::codegenoptions::DebugInfoKind debugInfo = NoDebugInfo) {
+inline void createDefaultFIRCodeGenPassPipeline(
+    mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig config) {
   fir::addBoxedProcedurePass(pm);
   pm.addNestedPass<mlir::func::FuncOp>(
       fir::createAbstractResultOnFuncOptPass());
   pm.addNestedPass<fir::GlobalOp>(fir::createAbstractResultOnGlobalOptPass());
   fir::addCodeGenRewritePass(pm);
   fir::addTargetRewritePass(pm);
-  fir::addExternalNameConversionPass(pm, underscoring);
-  fir::createDebugPasses(pm, debugInfo);
-  fir::addFIRToLLVMPass(pm, optLevel);
+  fir::addExternalNameConversionPass(pm, config.Underscoring);
+  fir::createDebugPasses(pm, config.DebugInfo);
+
+  if (config.VScaleMin != 0)
+    pm.addPass(fir::createVScaleAttrPass({config.VScaleMin, config.VScaleMax}));
+
+  fir::addFIRToLLVMPass(pm, config.OptLevel);
 }
 
 /// Create a pass pipeline for lowering from MLIR to LLVM IR
@@ -311,20 +311,15 @@ inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
-inline void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
-    llvm::OptimizationLevel optLevel = defaultOptLevel,
-    bool stackArrays = false, bool underscoring = true,
-    bool loopVersioning = false,
-    llvm::codegenoptions::DebugInfoKind debugInfo = NoDebugInfo) {
-  fir::createHLFIRToFIRPassPipeline(pm, optLevel);
+inline void createMLIRToLLVMPassPipeline(
+    mlir::PassManager &pm, const MLIRToLLVMPassPipelineConfig &config) {
+  fir::createHLFIRToFIRPassPipeline(pm, config.OptLevel);
 
   // Add default optimizer pass pipeline.
-  fir::createDefaultFIROptimizerPassPipeline(
-      pm, optLevel, stackArrays, loopVersioning);
+  fir::createDefaultFIROptimizerPassPipeline(pm, config);
 
   // Add codegen pass pipeline.
-  fir::createDefaultFIRCodeGenPassPipeline(
-      pm, optLevel, underscoring, debugInfo);
+  fir::createDefaultFIRCodeGenPassPipeline(pm, config);
 }
 #undef FLANG_EXCLUDE_CODEGEN
 #endif
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index d43a1fccccb62f..6245a2f1376fcc 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -13,11 +13,38 @@
 #ifndef FORTRAN_TOOLS_CROSS_TOOL_HELPERS_H
 #define FORTRAN_TOOLS_CROSS_TOOL_HELPERS_H
 
+#include "flang/Frontend/CodeGenOptions.h"
 #include "flang/Frontend/LangOptions.h"
 #include <cstdint>
 
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "llvm/Frontend/Debug/Options.h"
+#include "llvm/Passes/OptimizationLevel.h"
+
+/// Configuriation for the MLIR to LLVM pass pipeline.
+struct MLIRToLLVMPassPipelineConfig {
+  explicit MLIRToLLVMPassPipelineConfig(llvm::OptimizationLevel level) {
+    OptLevel = level;
+  }
+  explicit MLIRToLLVMPassPipelineConfig(llvm::OptimizationLevel level,
+      const Fortran::frontend::CodeGenOptions &opts) {
+    OptLevel = level;
+    StackArrays = opts.StackArrays;
+    Underscoring = opts.Underscoring;
+    LoopVersioning = opts.LoopVersioning;
+    DebugInfo = opts.getDebugInfo();
+  }
+
+  llvm::OptimizationLevel OptLevel; ///< optimisation level
+  bool StackArrays = false; ///< convert memory allocations to alloca.
+  bool Underscoring = true; ///< add underscores to function names.
+  bool LoopVersioning = false; ///< Run the version loop pass.
+  llvm::codegenoptions::DebugInfoKind DebugInfo =
+      llvm::codegenoptions::NoDebugInfo; ///< Debug info generation.
+  unsigned VScaleMin = 0; ///< SVE vector range minimum.
+  unsigned VScaleMax = 0; ///< SVE vector range maximum.
+};
 
 struct OffloadModuleOpts {
   OffloadModuleOpts() {}
diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt
index 211c19693de810..3116ff68ea2627 100644
--- a/flang/lib/Decimal/CMakeLists.txt
+++ b/flang/lib/Decimal/CMakeLists.txt
@@ -49,19 +49,7 @@ endif()
 # avoid an unwanted dependency on libstdc++.so.
 add_definitions(-U_GLIBCXX_ASSERTIONS)
 
-# Build FortranDecimal when the build target is Flang or LLVM.
-if (CMAKE_SOURCE_DIR STREQUAL FLANG_SOURCE_DIR OR CMAKE_SOURCE_DIR STREQUAL LLVM_MAIN_SRC_DIR)
-  add_flang_library(FortranDecimal
-    binary-to-decimal.cpp
-    decimal-to-binary.cpp
-  )
-# Build FortranDecimalRT for FlangRT when the build target is Runtimes.
-# Standalone builds of FlangRT is not supported.
-elseif (CMAKE_SOURCE_DIR STREQUAL Runtimes_SOURCE_DIR)
-  add_flang_library(FortranDecimalRT STATIC INSTALL_WITH_TOOLCHAIN PIC
-    binary-to-decimal.cpp
-    decimal-to-binary.cpp
-  )
-else()
-  message(FATAL_ERROR "CMAKE_SOURCE_DIR of target points to neither Flang or Flang-rt, no library added for FortranDecimal.")
-endif()
+add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN
+  binary-to-decimal.cpp
+  decimal-to-binary.cpp
+)
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 6c0b90cfac4341..37315e0e4d27a7 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -986,6 +986,41 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
   return true;
 }
 
+/// Parses vscale range options and populates the CompilerInvocation
+/// accordingly.
+/// Returns false if new errors are generated.
+///
+/// \param [out] invoc Stores the processed arguments
+/// \param [in] args The compiler invocation arguments to parse
+/// \param [out] diags DiagnosticsEngine to report erros with
+static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args,
+                            clang::DiagnosticsEngine &diags) {
+  LangOptions &opts = invoc.getLangOpts();
+  if (const auto arg =
+          args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ)) {
+    llvm::StringRef argValue = llvm::StringRef(arg->getValue());
+    unsigned VScaleMin;
+    if (argValue.getAsInteger(/*Radix=*/10, VScaleMin)) {
+      diags.Report(clang::diag::err_drv_unsupported_option_argument)
+          << arg->getSpelling() << argValue;
+      return false;
+    }
+    opts.VScaleMin = VScaleMin;
+  }
+  if (const auto arg =
+          args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ)) {
+    llvm::StringRef argValue = llvm::StringRef(arg->getValue());
+    unsigned VScaleMax;
+    if (argValue.getAsInteger(/*Radix=w*/ 10, VScaleMax)) {
+      diags.Report(clang::diag::err_drv_unsupported_option_argument)
+          << arg->getSpelling() << argValue;
+      return false;
+    }
+    opts.VScaleMax = VScaleMax;
+  }
+  return true;
+}
+
 bool CompilerInvocation::createFromArgs(
     CompilerInvocation &res, llvm::ArrayRef<const char *> commandLineArgs,
     clang::DiagnosticsEngine &diags, const char *argv0) {
@@ -1079,6 +1114,8 @@ bool CompilerInvocation::createFromArgs(
 
   success &= parseFloatingPointArgs(res, args, diags);
 
+  success &= parseVScaleArgs(res, args, diags);
+
   // Set the string to be used as the return value of the COMPILER_OPTIONS
   // intrinsic of iso_fortran_env. This is either passed in from the parent
   // compiler driver invocation with an environment variable, or failing that
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 6cc7808a30d8a0..73c00c8679c7ec 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -695,6 +695,22 @@ void CodeGenAction::lowerHLFIRToFIR() {
   }
 }
 
+// TODO: We should get this from TargetInfo. However, that depends on
+// too much of clang, so for now, replicate the functionality.
+static std::optional<std::pair<unsigned, unsigned>>
+getVScaleRange(CompilerInstance &ci,
+               const Fortran::frontend::LangOptions &langOpts) {
+  if (langOpts.VScaleMin || langOpts.VScaleMax)
+    return std::pair<unsigned, unsigned>(
+        langOpts.VScaleMin ? langOpts.VScaleMin : 1, langOpts.VScaleMax);
+
+  std::string featuresStr = getTargetFeatures(ci);
+  if (featuresStr.find("+sve") != std::string::npos)
+    return std::pair<unsigned, unsigned>(1, 16);
+
+  return std::nullopt;
+}
+
 // Lower the previously generated MLIR module into an LLVM IR module
 void CodeGenAction::generateLLVMIR() {
   assert(mlirModule && "The MLIR module has not been generated yet.");
@@ -713,10 +729,22 @@ void CodeGenAction::generateLLVMIR() {
   pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
   pm.enableVerifier(/*verifyPasses=*/true);
 
+  MLIRToLLVMPassPipelineConfig config(level, opts);
+
+  const auto targetOpts = ci.getInvocation().getTargetOpts();
+  const llvm::Triple triple(targetOpts.triple);
+
+  // Only get the vscale range if AArch64.
+  if (triple.isAArch64()) {
+    auto langOpts = ci.getInvocation().getLangOpts();
+    if (auto vsr = getVScaleRange(ci, langOpts)) {
+      config.VScaleMin = vsr->first;
+      config.VScaleMax = vsr->second;
+    }
+  }
+
   // Create the pass pipeline
-  fir::createMLIRToLLVMPassPipeline(pm, level, opts.StackArrays,
-                                    opts.Underscoring, opts.LoopVersioning,
-                                    opts.getDebugInfo());
+  fir::createMLIRToLLVMPassPipeline(pm, config);
   (void)mlir::applyPassManagerCLOptions(pm);
 
   // run the pass manager
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index ee838b3b4a546b..5ac4d822faaae5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -137,34 +137,41 @@ struct ConstructContext {
   Fortran::lower::StatementContext &stmtCtx; // construct exit code
 };
 
-/// Helper class to generate the runtime type info global data. This data
-/// is required to describe the derived type to the runtime so that it can
-/// operate over it. It must be ensured this data will be generated for every
-/// derived type lowered in the current translated unit. However, this data
+/// Helper class to generate the runtime type info global data and the
+/// fir.type_info operations that contain the dipatch tables (if any).
+/// The type info global data is required to describe the derived type to the
+/// runtime so that it can operate over it.
+/// It must be ensured these operations will be generated for every derived type
+/// lowered in the current translated unit. However, these operations
 /// cannot be generated before FuncOp have been created for functions since the
 /// initializers may take their address (e.g for type bound procedures). This
-/// class allows registering all the required runtime type info while it is not
-/// possible to create globals, and to generate this data after function
-/// lowering.
-class RuntimeTypeInfoConverter {
+/// class allows registering all the required type info while it is not
+/// possible to create GlobalOp/TypeInfoOp, and to generate this data afte
+/// function lowering.
+class TypeInfoConverter {
   /// Store the location and symbols of derived type info to be generated.
   /// The location of the derived type instantiation is also stored because
-  /// runtime type descriptor symbol are compiler generated and cannot be mapped
-  /// to user code on their own.
-  struct TypeInfoSymbol {
+  /// runtime type descriptor symbols are compiler generated and cannot be
+  /// mapped to user code on their own.
+  struct TypeInfo {
     Fortran::semantics::SymbolRef symbol;
+    const Fortran::semantics::DerivedTypeSpec &typeSpec;
+    fir::RecordType type;
     mlir::Location loc;
   };
 
 public:
-  void registerTypeInfoSymbol(Fortran::lower::AbstractConverter &converter,
-                              mlir::Location loc,
-                              Fortran::semantics::SymbolRef typeInfoSym) {
+  void registerTypeInfo(Fortran::lower::AbstractConverter &converter,
+                        mlir::Location loc,
+                        Fortran::semantics::SymbolRef typeInfoSym,
+                        const Fortran::semantics::DerivedTypeSpec &typeSpec,
+                        fir::RecordType type) {
     if (seen.contains(typeInfoSym))
       return;
     seen.insert(typeInfoSym);
     if (!skipRegistration) {
-      registeredTypeInfoSymbols.emplace_back(TypeInfoSymbol{typeInfoSym, loc});
+      registeredTypeInfo.emplace_back(
+          TypeInfo{typeInfoSym, typeSpec, type, loc});
       return;
     }
     // Once the registration is closed, symbols cannot be added to the
@@ -172,67 +179,59 @@ class RuntimeTypeInfoConverter {
     // However, after registration is closed, it is safe to directly generate
     // the globals because all FuncOps whose addresses may be required by the
     // initializers have been generated.
-    Fortran::lower::createRuntimeTypeInfoGlobal(converter, loc,
-                                                typeInfoSym.get());
+    createTypeInfoOpAndGlobal(converter,
+                              TypeInfo{typeInfoSym, typeSpec, type, loc});
   }
 
-  void createTypeInfoGlobals(Fortran::lower::AbstractConverter &converter) {
+  void createTypeInfo(Fortran::lower::AbstractConverter &converter) {
     skipRegistration = true;
-    for (const TypeInfoSymbol &info : registeredTypeInfoSymbols)
-      Fortran::lower::createRuntimeTypeInfoGlobal(converter, info.loc,
-                                                  info.symbol.get());
-    registeredTypeInfoSymbols.clear();
+    for (const TypeInfo &info : registeredTypeInfo)
+      createTypeInfoOpAndGlobal(converter, info);
+    registeredTypeInfo.clear();
   }
 
 private:
-  /// Store the runtime type descriptors that will be required for the
-  /// derived type that have been converted to FIR derived types.
-  llvm::SmallVector<TypeInfoSymbol> registeredTypeInfoSymbols;
-  /// Create derived type runtime info global immediately without storing the
-  /// symbol in registeredTypeInfoSymbols.
-  bool skipRegistration = false;
-  /// Track symbols symbols processed during and after the registration
-  /// to avoid infinite loops between type conversions and global variable
-  /// creation.
-  llvm::SmallSetVector<Fortran::semantics::SymbolRef, 32> seen;
-};
-
-class DispatchTableConverter {
-  struct DispatchTableInfo {
-    const Fortran::semantics::DerivedTypeSpec *typeSpec;
-    mlir::Location loc;
-  };
-
-public:
-  void registerTypeSpec(Fortran::lower::AbstractConverter &converter,
-                        mlir::Location loc,
-                        const Fortran::semantics::DerivedTypeSpec *typeSpec) {
-    assert(typeSpec && "type spec is null");
-    std::string dtName = converter.mangleName(*typeSpec);
-    if (seen.contains(dtName) || dtName.find("__fortran") != std::string::npos)
-      return;
-    seen.insert(dtName);
-    registeredDispatchTableInfo.emplace_back(DispatchTableInfo{typeSpec, loc});
-  }
-
-  void createDispatchTableOps(Fortran::lower::AbstractConverter &converter) {
-    for (const DispatchTableInfo &info : registeredDispatchTableInfo) {
-      std::string dtName = converter.mangleName(*info.typeSpec);
-      const Fortran::semantics::DerivedTypeSpec *parent =
-          Fortran::evaluate::GetParentTypeSpec(*info.typeSpec);
-      fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-      fir::DispatchTableOp dt = builder.createDispatchTableOp(
-          info.loc, dtName, parent ? converter.mangleName(*parent) : "");
-      auto insertPt = builder.saveInsertionPoint();
-      const Fortran::semantics::Scope *scope = info.typeSpec->scope();
-      if (!scope)
-        scope = info.typeSpec->typeSymbol().scope();
-      Fortran::semantics::SymbolVector bindings =
-          Fortran::semantics::CollectBindings(*scope);
-
-      if (!bindings.empty())
-        builder.createBlock(&dt.getRegion());
-
+  void createTypeInfoOpAndGlobal(Fortran::lower::AbstractConverter &converter,
+                                 const TypeInfo &info) {
+    Fortran::lower::createRuntimeTypeInfoGlobal(converter, info.loc,
+                                                info.symbol.get());
+    createTypeInfoOp(converter, info);
+  }
+
+  void createTypeInfoOp(Fortran::lower::AbstractConverter &converter,
+                        const TypeInfo &info) {
+    fir::RecordType parentType{};
+    if (const Fortran::semantics::DerivedTypeSpec *parent =
+            Fortran::evaluate::GetParentTypeSpec(info.typeSpec))
+      parentType = mlir::cast<fir::RecordType>(converter.genType(*parent));
+
+    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+    mlir::ModuleOp module = builder.getModule();
+    fir::TypeInfoOp dt =
+        module.lookupSymbol<fir::TypeInfoOp>(info.type.getName());
+    if (dt)
+      return; // Already created.
+    auto insertPt = builder.saveInsertionPoint();
+    builder.setInsertionPoint(module.getBody(), module.getBody()->end());
+    dt = builder.create<fir::TypeInfoOp>(info.loc, info.type, parentType);
+
+    if (!info.typeSpec.HasDefaultInitialization(/*ignoreAllocatable=*/false,
+                                                /*ignorePointer=*/false))
+      dt->setAttr(dt.getNoInitAttrName(), builder.getUnitAttr());
+    if (!info.typeSpec.HasDestruction())
+      dt->setAttr(dt.getNoDestroyAttrName(), builder.getUnitAttr());
+    if (!Fortran::semantics::MayRequireFinalization(info.typeSpec))
+      dt->setAttr(dt.getNoFinalAttrName(), builder.getUnitAttr());
+
+    const Fortran::semantics::Scope *scope = info.typeSpec.scope();
+    if (!scope)
+      scope = info.typeSpec.typeSymbol().scope();
+    assert(scope && "failed to find type scope");
+
+    Fortran::semantics::SymbolVector bindings =
+        Fortran::semantics::CollectBindings(*scope);
+    if (!bindings.empty()) {
+      builder.createBlock(&dt.getDispatchTable());
       for (const Fortran::semantics::SymbolRef &binding : bindings) {
         const auto &details =
             binding.get().get<Fortran::semantics::ProcBindingDetails>();
@@ -244,20 +243,21 @@ class DispatchTableConverter {
             info.loc, mlir::StringAttr::get(builder.getContext(), tbpName),
             mlir::SymbolRefAttr::get(builder.getContext(), bindingName));
       }
-      if (!bindings.empty())
-        builder.create<fir::FirEndOp>(info.loc);
-      builder.restoreInsertionPoint(insertPt);
+      builder.create<fir::FirEndOp>(info.loc);
     }
-    registeredDispatchTableInfo.clear();
+    builder.restoreInsertionPoint(insertPt);
   }
 
-private:
-  /// Store the semantic DerivedTypeSpec that will be required to generate the
-  /// dispatch table.
-  llvm::SmallVector<DispatchTableInfo> registeredDispatchTableInfo;
-
-  /// Track processed type specs to avoid multiple creation.
-  llvm::StringSet<> seen;
+  /// Store the front-end data that will be required to generate the type info
+  /// for the derived types that have been converted to fir.type<>.
+  llvm::SmallVector<TypeInfo> registeredTypeInfo;
+  /// Create derived type info immediately without storing the
+  /// symbol in registeredTypeInfo.
+  bool skipRegistration = false;
+  /// Track symbols symbols processed during and after the registration
+  /// to avoid infinite loops between type conversions and global variable
+  /// creation.
+  llvm::SmallSetVector<Fortran::semantics::SymbolRef, 32> seen;
 };
 
 using IncrementLoopNestInfo = llvm::SmallVector<IncrementLoopInfo, 8>;
@@ -334,13 +334,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
     /// Once all the code has been translated, create runtime type info
     /// global data structure for the derived types that have been
-    /// processed.
-    createGlobalOutsideOfFunctionLowering(
-        [&]() { runtimeTypeInfoConverter.createTypeInfoGlobals(*this); });
-
-    /// Create the dispatch tables for derived types.
+    /// processed as well as the fir.type_info operations with the
+    /// dispatch tables.
     createGlobalOutsideOfFunctionLowering(
-        [&]() { dispatchTableConverter.createDispatchTableOps(*this); });
+        [&]() { typeInfoConverter.createTypeInfo(*this); });
 
     // Create the list of any environment defaults for the runtime to set. The
     // runtime default list is only created if there is a main program to ensure
@@ -875,16 +872,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     hostAssocTuple = val;
   }
 
-  void registerRuntimeTypeInfo(
-      mlir::Location loc,
-      Fortran::lower::SymbolRef typeInfoSym) override final {
-    runtimeTypeInfoConverter.registerTypeInfoSymbol(*this, loc, typeInfoSym);
-  }
-
-  void registerDispatchTableInfo(
-      mlir::Location loc,
-      const Fortran::semantics::DerivedTypeSpec *typeSpec) override final {
-    dispatchTableConverter.registerTypeSpec(*this, loc, typeSpec);
+  void registerTypeInfo(mlir::Location loc,
+                        Fortran::lower::SymbolRef typeInfoSym,
+                        const Fortran::semantics::DerivedTypeSpec &typeSpec,
+                        fir::RecordType type) override final {
+    typeInfoConverter.registerTypeInfo(*this, loc, typeInfoSym, typeSpec, type);
   }
 
   llvm::StringRef
@@ -4811,8 +4803,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   Fortran::lower::pft::Evaluation *evalPtr = nullptr;
   Fortran::lower::SymMap localSymbols;
   Fortran::parser::CharBlock currentPosition;
-  RuntimeTypeInfoConverter runtimeTypeInfoConverter;
-  DispatchTableConverter dispatchTableConverter;
+  TypeInfoConverter typeInfoConverter;
 
   // Stack to manage object deallocation and finalization at construct exits.
   llvm::SmallVector<ConstructContext> activeConstructStack;
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 0510965a596fb0..90025ba9c687a7 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -29,6 +29,7 @@
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "mlir/IR/IRMapping.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
@@ -428,6 +429,29 @@ fir::ExtendedValue Fortran::lower::genCallOpAndResult(
   }
 
   if (allocatedResult) {
+    // The result must be optionally destroyed (if it is of a derived type
+    // that may need finalization or deallocation of the components).
+    // For an allocatable result we have to free the memory allocated
+    // for the top-level entity. Note that the Destroy calls below
+    // do not deallocate the top-level entity. The two clean-ups
+    // must be pushed in reverse order, so that the final order is:
+    //   Destroy(desc)
+    //   free(desc->base_addr)
+    allocatedResult->match(
+        [&](const fir::MutableBoxValue &box) {
+          if (box.isAllocatable()) {
+            // 9.7.3.2 point 4. Deallocate allocatable results. Note that
+            // finalization was done independently by calling
+            // genDerivedTypeDestroy above and is not triggered by this inline
+            // deallocation.
+            fir::FirOpBuilder *bldr = &converter.getFirOpBuilder();
+            stmtCtx.attachCleanup([bldr, loc, box]() {
+              fir::factory::genFreememIfAllocated(*bldr, loc, box);
+            });
+          }
+        },
+        [](const auto &) {});
+
     // 7.5.6.3 point 5. Derived-type finalization for nonpointer function.
     // Check if the derived-type is finalizable if it is a monomorphic
     // derived-type.
@@ -435,7 +459,6 @@ fir::ExtendedValue Fortran::lower::genCallOpAndResult(
     // in any cases.
     std::optional<Fortran::evaluate::DynamicType> retTy =
         caller.getCallDescription().proc().GetType();
-    bool cleanupWithDestroy = false;
     // With HLFIR lowering, isElemental must be set to true
     // if we are producing an elemental call. In this case,
     // the elemental results must not be destroyed, instead,
@@ -451,34 +474,23 @@ fir::ExtendedValue Fortran::lower::genCallOpAndResult(
           fir::runtime::genDerivedTypeDestroy(*bldr, loc,
                                               fir::getBase(*allocatedResult));
         });
-        cleanupWithDestroy = true;
       } else {
         const Fortran::semantics::DerivedTypeSpec &typeSpec =
             retTy->GetDerivedTypeSpec();
-        if (Fortran::semantics::IsFinalizable(typeSpec)) {
+        // If the result type may require finalization
+        // or have allocatable components, we need to make sure
+        // everything is properly finalized/deallocated.
+        if (Fortran::semantics::MayRequireFinalization(typeSpec) ||
+            // We can use DerivedTypeDestroy even if finalization is not needed.
+            hlfir::mayHaveAllocatableComponent(funcType.getResults()[0])) {
           auto *bldr = &converter.getFirOpBuilder();
           stmtCtx.attachCleanup([bldr, loc, allocatedResult]() {
             mlir::Value box = bldr->createBox(loc, *allocatedResult);
             fir::runtime::genDerivedTypeDestroy(*bldr, loc, box);
           });
-          cleanupWithDestroy = true;
         }
       }
     }
-    allocatedResult->match(
-        [&](const fir::MutableBoxValue &box) {
-          if (box.isAllocatable() && !cleanupWithDestroy) {
-            // 9.7.3.2 point 4. Deallocate allocatable results. Note that
-            // finalization was done independently by calling
-            // genDerivedTypeDestroy above and is not triggered by this inline
-            // deallocation.
-            fir::FirOpBuilder *bldr = &converter.getFirOpBuilder();
-            stmtCtx.attachCleanup([bldr, loc, box]() {
-              fir::factory::genFreememIfAllocated(*bldr, loc, box);
-            });
-          }
-        },
-        [](const auto &) {});
     return *allocatedResult;
   }
 
@@ -1232,8 +1244,25 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
 
   if (!fir::getBase(result))
     return std::nullopt; // subroutine call.
-  // TODO: "move" non pointer results into hlfir.expr.
-  return extendedValueToHlfirEntity(loc, builder, result, ".tmp.func_result");
+
+  hlfir::Entity resultEntity =
+      extendedValueToHlfirEntity(loc, builder, result, ".tmp.func_result");
+
+  if (!fir::isPointerType(fir::getBase(result).getType())) {
+    resultEntity = loadTrivialScalar(loc, builder, resultEntity);
+
+    if (resultEntity.isVariable()) {
+      // Function result must not be freed, since it is allocated on the stack.
+      // Note that in non-elemental case, genCallOpAndResult()
+      // is responsible for establishing the clean-up that destroys
+      // the derived type result or deallocates its components
+      // without finalization.
+      auto asExpr = builder.create<hlfir::AsExprOp>(
+          loc, resultEntity, /*mustFree=*/builder.createBool(loc, false));
+      resultEntity = hlfir::EntityWithAttributes{asExpr.getResult()};
+    }
+  }
+  return hlfir::EntityWithAttributes{resultEntity};
 }
 
 /// Create an optional dummy argument value from an entity that may be
@@ -1591,37 +1620,33 @@ class ElementalCallBuilder {
     for (unsigned i = 0; i < numArgs; ++i) {
       auto &preparedActual = loweredActuals[i];
       if (preparedActual) {
-        hlfir::Entity actual = preparedActual->getOriginalActual();
         // Elemental procedure dummy arguments cannot be pointer/allocatables
         // (C15100), so it is safe to dereference any pointer or allocatable
         // actual argument now instead of doing this inside the elemental
         // region.
-        actual = hlfir::derefPointersAndAllocatables(loc, builder, actual);
+        preparedActual->derefPointersAndAllocatables(loc, builder);
         // Better to load scalars outside of the loop when possible.
         if (!preparedActual->handleDynamicOptional() &&
             impl().canLoadActualArgumentBeforeLoop(i))
-          actual = hlfir::loadTrivialScalar(loc, builder, actual);
+          preparedActual->loadTrivialScalar(loc, builder);
         // TODO: merge shape instead of using the first one.
-        if (!shape && actual.isArray()) {
+        if (!shape && preparedActual->isArray()) {
           if (preparedActual->handleDynamicOptional())
             optionalWithShape = &*preparedActual;
           else
-            shape = hlfir::genShape(loc, builder, actual);
+            shape = preparedActual->genShape(loc, builder);
         }
         // 15.8.3 p1. Elemental procedure with intent(out)/intent(inout)
         // arguments must be called in element order.
         if (impl().argMayBeModifiedByCall(i))
           mustBeOrdered = true;
-        // Propagates pointer dereferences and scalar loads.
-        preparedActual->setOriginalActual(actual);
       }
     }
     if (!shape && optionalWithShape) {
       // If all array operands appear in optional positions, then none of them
       // is allowed to be absent as per 15.5.2.12 point 3. (6). Just pick the
       // first operand.
-      shape =
-          hlfir::genShape(loc, builder, optionalWithShape->getOriginalActual());
+      shape = optionalWithShape->genShape(loc, builder);
       // TODO: There is an opportunity to add a runtime check here that
       // this array is present as required. Also, the optionality of all actual
       // could be checked and reset given the Fortran requirement.
@@ -1635,16 +1660,10 @@ class ElementalCallBuilder {
     // intent(inout) arguments. Note that the scalar arguments are handled
     // above.
     if (mustBeOrdered) {
-      for (unsigned i = 0; i < numArgs; ++i) {
-        auto &preparedActual = loweredActuals[i];
+      for (auto &preparedActual : loweredActuals) {
         if (preparedActual) {
-          hlfir::Entity actual = preparedActual->getOriginalActual();
-          if (!actual.isVariable() && actual.isArray()) {
-            mlir::Type storageType = actual.getType();
-            hlfir::AssociateOp associate = hlfir::genAssociateExpr(
-                loc, builder, actual, storageType, "adapt.impure_arg_eval");
-            preparedActual->setOriginalActual(hlfir::Entity{associate});
-
+          if (hlfir::AssociateOp associate =
+                  preparedActual->associateIfArrayExpr(loc, builder)) {
             fir::FirOpBuilder *bldr = &builder;
             callContext.stmtCtx.attachCleanup(
                 [=]() { bldr->create<hlfir::EndAssociateOp>(loc, associate); });
@@ -1824,9 +1843,8 @@ class ElementalIntrinsicCallBuilder
     if (intrinsic)
       if (intrinsic->name == "adjustr" || intrinsic->name == "adjustl" ||
           intrinsic->name == "merge")
-        return hlfir::genCharLength(
-            callContext.loc, callContext.getBuilder(),
-            loweredActuals[0].value().getOriginalActual());
+        return loweredActuals[0].value().genCharLength(
+            callContext.loc, callContext.getBuilder());
     // Character MIN/MAX is the min/max of the arguments length that are
     // present.
     TODO(callContext.loc,
@@ -1846,7 +1864,7 @@ class ElementalIntrinsicCallBuilder
       // the same declared and dynamic types. So any of them can be used
       // for the mold.
       assert(!loweredActuals.empty());
-      return loweredActuals.front()->getOriginalActual();
+      return loweredActuals.front()->getPolymorphicMold(callContext.loc);
     }
 
     return {};
@@ -2109,7 +2127,7 @@ genProcedureRef(CallContext &callContext) {
   Fortran::lower::CallerInterface caller(callContext.procRef,
                                          callContext.converter);
   mlir::FunctionType callSiteType = caller.genFunctionType();
-
+  const bool isElemental = callContext.isElementalProcWithArrayArgs();
   Fortran::lower::PreparedActualArguments loweredActuals;
   // Lower the actual arguments
   for (const Fortran::lower::CallInterface<
@@ -2134,6 +2152,21 @@ genProcedureRef(CallContext &callContext) {
         }
       }
 
+      if (isElemental && !arg.hasValueAttribute() &&
+          Fortran::evaluate::IsVariable(*expr) &&
+          Fortran::evaluate::HasVectorSubscript(*expr)) {
+        // Vector subscripted arguments are copied in calls, except in elemental
+        // calls without VALUE attribute where Fortran 2018 15.5.2.4 point 21
+        // does not apply and the address of each element must be passed.
+        hlfir::ElementalAddrOp elementalAddr =
+            Fortran::lower::convertVectorSubscriptedExprToElementalAddr(
+                loc, callContext.converter, *expr, callContext.symMap,
+                callContext.stmtCtx);
+        loweredActuals.emplace_back(
+            Fortran::lower::PreparedActualArgument{elementalAddr});
+        continue;
+      }
+
       auto loweredActual = Fortran::lower::convertExprToHLFIR(
           loc, callContext.converter, *expr, callContext.symMap,
           callContext.stmtCtx);
@@ -2150,7 +2183,7 @@ genProcedureRef(CallContext &callContext) {
       // Optional dummy argument for which there is no actual argument.
       loweredActuals.emplace_back(std::nullopt);
     }
-  if (callContext.isElementalProcWithArrayArgs()) {
+  if (isElemental) {
     bool isImpure = false;
     if (const Fortran::semantics::Symbol *procSym =
             callContext.procRef.proc().GetSymbol())
@@ -2161,6 +2194,27 @@ genProcedureRef(CallContext &callContext) {
   return genUserCall(loweredActuals, caller, callSiteType, callContext);
 }
 
+hlfir::Entity Fortran::lower::PreparedActualArgument::getActual(
+    mlir::Location loc, fir::FirOpBuilder &builder) const {
+  if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual)) {
+    if (oneBasedElementalIndices)
+      return hlfir::getElementAt(loc, builder, *actualEntity,
+                                 *oneBasedElementalIndices);
+    return *actualEntity;
+  }
+  assert(oneBasedElementalIndices && "expect elemental context");
+  hlfir::ElementalAddrOp elementalAddr =
+      std::get<hlfir::ElementalAddrOp>(actual);
+  mlir::IRMapping mapper;
+  auto alwaysFalse = [](hlfir::ElementalOp) -> bool { return false; };
+  mlir::Value addr = hlfir::inlineElementalOp(
+      loc, builder, elementalAddr, *oneBasedElementalIndices, mapper,
+      /*mustRecursivelyInline=*/alwaysFalse);
+  assert(elementalAddr.getCleanup().empty() && "no clean-up expected");
+  elementalAddr.erase();
+  return hlfir::Entity{addr};
+}
+
 bool Fortran::lower::isIntrinsicModuleProcRef(
     const Fortran::evaluate::ProcedureRef &procRef) {
   const Fortran::semantics::Symbol *symbol = procRef.proc().GetSymbol();
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index 940e70da511c22..6e7a60e42abfe6 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -388,37 +388,38 @@ static mlir::Value genInlinedStructureCtorLitImpl(
 
     // Special handling for scalar c_ptr/c_funptr constants. The array constant
     // must fall through to genConstantValue() below.
-    if (Fortran::semantics::IsBuiltinCPtr(sym) && sym->Rank() == 0) {
-      // Builtin c_ptr and c_funptr have special handling because initial
-      // values are handled for them as an extension.
+    if (Fortran::semantics::IsBuiltinCPtr(sym) && sym->Rank() == 0 &&
+        (Fortran::evaluate::GetLastSymbol(expr.value()) ||
+         Fortran::evaluate::IsNullPointer(expr.value()))) {
+      // Builtin c_ptr and c_funptr have special handling because designators
+      // and NULL() are handled as initial values for them as an extension
+      // (otherwise only c_ptr_null/c_funptr_null are allowed and these are
+      // replaced by structure constructors by semantics, so GetLastSymbol
+      // returns nothing).
+
+      // The Ev::Expr is an initializer that is a pointer target (e.g., 'x' or
+      // NULL()) that must be inserted into an intermediate cptr record value's
+      // address field, which ought to be an intptr_t on the target.
       mlir::Value addr = fir::getBase(Fortran::lower::genExtAddrInInitializer(
           converter, loc, expr.value()));
-      if (addr.getType() == componentTy) {
-        // Do nothing. The Ev::Expr was returned as a value that can be
-        // inserted directly to the component without an intermediary.
-      } else {
-        // The Ev::Expr returned is an initializer that is a pointer (e.g.,
-        // null) that must be inserted into an intermediate cptr record
-        // value's address field, which ought to be an intptr_t on the target.
-        if (addr.getType().isa<fir::BoxProcType>())
-          addr = builder.create<fir::BoxAddrOp>(loc, addr);
-        assert((fir::isa_ref_type(addr.getType()) ||
-                addr.getType().isa<mlir::FunctionType>()) &&
-               "expect reference type for address field");
-        assert(fir::isa_derived(componentTy) &&
-               "expect C_PTR, C_FUNPTR to be a record");
-        auto cPtrRecTy = componentTy.cast<fir::RecordType>();
-        llvm::StringRef addrFieldName = Fortran::lower::builtin::cptrFieldName;
-        mlir::Type addrFieldTy = cPtrRecTy.getType(addrFieldName);
-        auto addrField = builder.create<fir::FieldIndexOp>(
-            loc, fieldTy, addrFieldName, componentTy,
-            /*typeParams=*/mlir::ValueRange{});
-        mlir::Value castAddr = builder.createConvert(loc, addrFieldTy, addr);
-        auto undef = builder.create<fir::UndefOp>(loc, componentTy);
-        addr = builder.create<fir::InsertValueOp>(
-            loc, componentTy, undef, castAddr,
-            builder.getArrayAttr(addrField.getAttributes()));
-      }
+      if (addr.getType().isa<fir::BoxProcType>())
+        addr = builder.create<fir::BoxAddrOp>(loc, addr);
+      assert((fir::isa_ref_type(addr.getType()) ||
+              addr.getType().isa<mlir::FunctionType>()) &&
+             "expect reference type for address field");
+      assert(fir::isa_derived(componentTy) &&
+             "expect C_PTR, C_FUNPTR to be a record");
+      auto cPtrRecTy = componentTy.cast<fir::RecordType>();
+      llvm::StringRef addrFieldName = Fortran::lower::builtin::cptrFieldName;
+      mlir::Type addrFieldTy = cPtrRecTy.getType(addrFieldName);
+      auto addrField = builder.create<fir::FieldIndexOp>(
+          loc, fieldTy, addrFieldName, componentTy,
+          /*typeParams=*/mlir::ValueRange{});
+      mlir::Value castAddr = builder.createConvert(loc, addrFieldTy, addr);
+      auto undef = builder.create<fir::UndefOp>(loc, componentTy);
+      addr = builder.create<fir::InsertValueOp>(
+          loc, componentTy, undef, castAddr,
+          builder.getArrayAttr(addrField.getAttributes()));
       res = builder.create<fir::InsertValueOp>(
           loc, recTy, res, addr, builder.getArrayAttr(field.getAttributes()));
       continue;
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index efaeba29b762ad..22b83efe8678bb 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -426,14 +426,12 @@ struct TypeBuilderImpl {
     }
     LLVM_DEBUG(llvm::dbgs() << "derived type: " << rec << '\n');
 
-    converter.registerDispatchTableInfo(loc, &tySpec);
-
     // Generate the type descriptor object if any
     if (const Fortran::semantics::Scope *derivedScope =
             tySpec.scope() ? tySpec.scope() : tySpec.typeSymbol().scope())
       if (const Fortran::semantics::Symbol *typeInfoSym =
               derivedScope->runtimeDerivedTypeDescription())
-        converter.registerRuntimeTypeInfo(loc, *typeInfoSym);
+        converter.registerTypeInfo(loc, *typeInfoSym, tySpec, rec);
     return rec;
   }
 
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 87b570d1ac4b7e..ef1f68f7e0ebc3 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -96,6 +96,17 @@ static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
   return false;
 }
 
+// Does this variable have an allocatable direct component?
+static bool
+hasAllocatableDirectComponent(const Fortran::semantics::Symbol &sym) {
+  if (sym.has<Fortran::semantics::ObjectEntityDetails>())
+    if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
+      if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
+              declTypeSpec->AsDerived())
+        return Fortran::semantics::HasAllocatableDirectComponent(
+            *derivedTypeSpec);
+  return false;
+}
 //===----------------------------------------------------------------===//
 // Global variables instantiation (not for alias and common)
 //===----------------------------------------------------------------===//
@@ -352,8 +363,9 @@ static mlir::Value genDefaultInitializerValue(
         componentValue = genDefaultInitializerValue(converter, loc, component,
                                                     componentTy, stmtCtx);
       } else {
-        // Component has no initial value.
-        componentValue = builder.create<fir::UndefOp>(loc, componentTy);
+        // Component has no initial value. Set its bits to zero by extension
+        // to match what is expected because other compilers are doing it.
+        componentValue = builder.create<fir::ZeroOp>(loc, componentTy);
       }
     } else if (const auto *proc{
                    component
@@ -361,7 +373,7 @@ static mlir::Value genDefaultInitializerValue(
       if (proc->init().has_value())
         TODO(loc, "procedure pointer component default initialization");
       else
-        componentValue = builder.create<fir::UndefOp>(loc, componentTy);
+        componentValue = builder.create<fir::ZeroOp>(loc, componentTy);
     }
     assert(componentValue && "must have been computed");
     componentValue = builder.createConvert(loc, componentTy, componentValue);
@@ -669,6 +681,15 @@ needDeallocationOrFinalization(const Fortran::lower::pft::Variable &var) {
       return VariableCleanUp::Deallocate;
     if (hasFinalization(sym))
       return VariableCleanUp::Finalize;
+    // hasFinalization() check above handled all cases that require
+    // finalization, but we also have to deallocate all allocatable
+    // components of local variables (since they are also local variables
+    // according to F18 5.4.3.2.2, p. 2, note 1).
+    // Here, the variable itself is not allocatable. If it has an allocatable
+    // component the Destroy runtime does the job. Use the Finalize clean-up,
+    // though there will be no finalization in runtime.
+    if (hasAllocatableDirectComponent(sym))
+      return VariableCleanUp::Finalize;
   }
   return std::nullopt;
 }
@@ -890,7 +911,7 @@ static fir::GlobalOp defineGlobalAggregateStore(
   Fortran::lower::createGlobalInitialization(
       builder, global, [&](fir::FirOpBuilder &builder) {
         Fortran::lower::StatementContext stmtCtx;
-        mlir::Value initVal = builder.create<fir::UndefOp>(loc, aggTy);
+        mlir::Value initVal = builder.create<fir::ZeroOp>(loc, aggTy);
         builder.create<fir::HasValueOp>(loc, initVal);
       });
   return global;
@@ -1171,7 +1192,7 @@ static void finalizeCommonBlockDefinition(
   mlir::TupleType commonTy = global.getType().cast<mlir::TupleType>();
   auto initFunc = [&](fir::FirOpBuilder &builder) {
     mlir::IndexType idxTy = builder.getIndexType();
-    mlir::Value cb = builder.create<fir::UndefOp>(loc, commonTy);
+    mlir::Value cb = builder.create<fir::ZeroOp>(loc, commonTy);
     unsigned tupIdx = 0;
     std::size_t offset = 0;
     LLVM_DEBUG(llvm::dbgs() << "block {\n");
@@ -1926,8 +1947,13 @@ void Fortran::lower::mapSymbolAttributes(
   if (ba.isChar()) {
     if (arg) {
       assert(!preAlloc && "dummy cannot be pre-allocated");
-      if (arg.getType().isa<fir::BoxCharType>())
+      if (arg.getType().isa<fir::BoxCharType>()) {
         std::tie(addr, len) = charHelp.createUnboxChar(arg);
+        // Ensure proper type is given to array/scalar that transited via
+        // fir.boxchar arg.
+        mlir::Type castTy = builder.getRefType(converter.genType(var));
+        addr = builder.createConvert(loc, castTy, addr);
+      }
     }
     if (std::optional<int64_t> cstLen = ba.getCharLenConst()) {
       // Static length
@@ -1953,13 +1979,6 @@ void Fortran::lower::mapSymbolAttributes(
     }
   }
 
-  if (addr && addr.getDefiningOp<fir::UnboxCharOp>()) {
-    // Ensure proper type is given to array/scalar that transited via
-    // fir.boxchar arg.
-    mlir::Type castTy = builder.getRefType(converter.genType(var));
-    addr = builder.createConvert(loc, castTy, addr);
-  }
-
   // Compute array extents and lower bounds.
   if (ba.isArray()) {
     if (ba.isStaticArray()) {
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index 662897c9e6c01b..535ec1c03b54db 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -799,13 +799,17 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
           }
         }
       }
-      // ub = baseLb + extent - 1
       if (!ubound) {
-        mlir::Value ext =
-            fir::factory::readExtent(builder, loc, dataExv, dimension);
-        mlir::Value lbExt =
-            builder.create<mlir::arith::AddIOp>(loc, ext, baseLb);
-        ubound = builder.create<mlir::arith::SubIOp>(loc, lbExt, one);
+        extent = fir::factory::readExtent(builder, loc, dataExv, dimension);
+        if (defaultLb) {
+          // ub = extent - 1
+          ubound = builder.create<mlir::arith::SubIOp>(loc, extent, one);
+        } else {
+          // ub = baseLb + extent - 1
+          mlir::Value lbExt =
+              builder.create<mlir::arith::AddIOp>(loc, extent, baseLb);
+          ubound = builder.create<mlir::arith::SubIOp>(loc, lbExt, one);
+        }
       }
       mlir::Value bound = builder.create<BoundsOp>(
           loc, boundTy, lbound, ubound, extent, stride, strideInBytes, baseLb);
diff --git a/flang/lib/Lower/HlfirIntrinsics.cpp b/flang/lib/Lower/HlfirIntrinsics.cpp
index 20e570044e8d4a..9f764b61425226 100644
--- a/flang/lib/Lower/HlfirIntrinsics.cpp
+++ b/flang/lib/Lower/HlfirIntrinsics.cpp
@@ -152,7 +152,7 @@ mlir::Value HlfirTransformationalIntrinsic::loadBoxAddress(
   if (!arg)
     return mlir::Value{};
 
-  hlfir::Entity actual = arg->getOriginalActual();
+  hlfir::Entity actual = arg->getActual(loc, builder);
 
   if (!arg->handleDynamicOptional()) {
     if (actual.isMutableBox()) {
@@ -193,7 +193,7 @@ llvm::SmallVector<mlir::Value> HlfirTransformationalIntrinsic::getOperandVector(
       operands.emplace_back();
       continue;
     }
-    hlfir::Entity actual = arg->getOriginalActual();
+    hlfir::Entity actual = arg->getActual(loc, builder);
     mlir::Value valArg;
 
     if (!argLowering) {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index c51bdedcde6dc5..c8089f40602861 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -478,6 +478,33 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
   return recipe;
 }
 
+/// Get a string representation of the bounds.
+std::string getBoundsString(llvm::SmallVector<mlir::Value> &bounds) {
+  std::stringstream boundStr;
+  if (!bounds.empty())
+    boundStr << "_section_";
+  llvm::interleave(
+      bounds,
+      [&](mlir::Value bound) {
+        auto boundsOp =
+            mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+        if (boundsOp.getLowerbound() &&
+            fir::getIntIfConstant(boundsOp.getLowerbound()) &&
+            boundsOp.getUpperbound() &&
+            fir::getIntIfConstant(boundsOp.getUpperbound())) {
+          boundStr << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound())
+                   << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound());
+        } else if (boundsOp.getExtent() &&
+                   fir::getIntIfConstant(boundsOp.getExtent())) {
+          boundStr << "ext" << *fir::getIntIfConstant(boundsOp.getExtent());
+        } else {
+          boundStr << "?";
+        }
+      },
+      [&] { boundStr << "x"; });
+  return boundStr.str();
+}
+
 /// Rebuild the array type from the acc.bounds operation with constant
 /// lowerbound/upperbound or extent.
 mlir::Type getTypeFromBounds(llvm::SmallVector<mlir::Value> &bounds,
@@ -653,16 +680,16 @@ bool isConstantBound(mlir::acc::DataBoundsOp &op) {
   return false;
 }
 
-/// Determine if the bounds represent a dynamic shape.
-bool hasDynamicShape(llvm::SmallVector<mlir::Value> &bounds) {
-  if (bounds.empty())
-    return false;
-  for (auto b : bounds) {
-    auto op = mlir::dyn_cast<mlir::acc::DataBoundsOp>(b.getDefiningOp());
-    if (!isConstantBound(op))
-      return true;
+/// Return true iff all the bounds are expressed with constant values.
+bool areAllBoundConstant(llvm::SmallVector<mlir::Value> &bounds) {
+  for (auto bound : bounds) {
+    auto dataBound =
+        mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+    assert(dataBound && "Must be DataBoundOp operation");
+    if (!isConstantBound(dataBound))
+      return false;
   }
-  return false;
+  return true;
 }
 
 /// Return a constant with the initial value for the reduction operator and
@@ -715,9 +742,28 @@ static mlir::Value getReductionInitValue(fir::FirOpBuilder &builder,
   if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
     return getReductionInitValue(builder, loc, boxTy.getEleTy(), op);
 
+  if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+    return getReductionInitValue(builder, loc, heapTy.getEleTy(), op);
+
+  if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(ty))
+    return getReductionInitValue(builder, loc, ptrTy.getEleTy(), op);
+
   llvm::report_fatal_error("Unsupported OpenACC reduction type");
 }
 
+/// Return the nested sequence type if any.
+static mlir::Type extractSequenceType(mlir::Type ty) {
+  if (mlir::isa<fir::SequenceType>(ty))
+    return ty;
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+    return extractSequenceType(boxTy.getEleTy());
+  if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+    return extractSequenceType(heapTy.getEleTy());
+  if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(ty))
+    return extractSequenceType(ptrTy.getEleTy());
+  return mlir::Type{};
+}
+
 static mlir::Value genReductionInitRegion(fir::FirOpBuilder &builder,
                                           mlir::Location loc, mlir::Type ty,
                                           mlir::acc::ReductionOperator op) {
@@ -761,7 +807,8 @@ static mlir::Value genReductionInitRegion(fir::FirOpBuilder &builder,
       return declareOp.getBase();
     }
   } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
-    if (!mlir::isa<fir::SequenceType>(boxTy.getEleTy()))
+    mlir::Type innerTy = extractSequenceType(boxTy);
+    if (!mlir::isa<fir::SequenceType>(innerTy))
       TODO(loc, "Unsupported boxed type for reduction");
     // Create the private copy from the initial fir.box.
     hlfir::Entity source = hlfir::Entity{builder.getBlock()->getArgument(0)};
@@ -863,32 +910,98 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder,
   TODO(loc, "reduction operator");
 }
 
+static fir::ShapeOp
+genShapeFromBounds(mlir::Location loc, fir::FirOpBuilder &builder,
+                   const llvm::SmallVector<mlir::Value> &args) {
+  assert(args.size() % 3 == 0 && "Triplets must be a multiple of 3");
+  llvm::SmallVector<mlir::Value> extents;
+  mlir::Type idxTy = builder.getIndexType();
+  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
+  mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
+  for (unsigned i = 0; i < args.size(); i += 3) {
+    mlir::Value s1 =
+        builder.create<mlir::arith::SubIOp>(loc, args[i + 1], args[0]);
+    mlir::Value s2 = builder.create<mlir::arith::AddIOp>(loc, s1, one);
+    mlir::Value s3 = builder.create<mlir::arith::DivSIOp>(loc, s2, args[i + 2]);
+    mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
+        loc, mlir::arith::CmpIPredicate::sgt, s3, zero);
+    mlir::Value ext = builder.create<mlir::arith::SelectOp>(loc, cmp, s3, zero);
+    extents.push_back(ext);
+  }
+  return builder.create<fir::ShapeOp>(loc, extents);
+}
+
+static llvm::SmallVector<mlir::Value>
+genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc,
+                  mlir::acc::DataBoundsOp &dataBound) {
+  mlir::Type idxTy = builder.getIndexType();
+  mlir::Value lb, ub, step;
+  if (dataBound.getLowerbound() &&
+      fir::getIntIfConstant(dataBound.getLowerbound()) &&
+      dataBound.getUpperbound() &&
+      fir::getIntIfConstant(dataBound.getUpperbound())) {
+    lb = builder.createIntegerConstant(
+        loc, idxTy, *fir::getIntIfConstant(dataBound.getLowerbound()));
+    ub = builder.createIntegerConstant(
+        loc, idxTy, *fir::getIntIfConstant(dataBound.getUpperbound()));
+    step = builder.createIntegerConstant(loc, idxTy, 1);
+  } else if (dataBound.getExtent()) {
+    lb = builder.createIntegerConstant(loc, idxTy, 0);
+    ub = builder.createIntegerConstant(
+        loc, idxTy, *fir::getIntIfConstant(dataBound.getExtent()) - 1);
+    step = builder.createIntegerConstant(loc, idxTy, 1);
+  } else {
+    llvm::report_fatal_error("Expect constant lb/ub or extent");
+  }
+  return {lb, ub, step};
+}
+
 static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
                         mlir::acc::ReductionOperator op, mlir::Type ty,
                         mlir::Value value1, mlir::Value value2,
-                        mlir::acc::ReductionRecipeOp &recipe) {
+                        mlir::acc::ReductionRecipeOp &recipe,
+                        llvm::SmallVector<mlir::Value> &bounds,
+                        bool allConstantBound) {
   ty = fir::unwrapRefType(ty);
 
   if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) {
     assert(!seqTy.hasDynamicExtents() &&
            "Assumed shaped array should be boxed for reduction");
     mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy());
-    unsigned nbRangeArgs = recipe.getCombinerRegion().getArguments().size() - 2;
-    assert((nbRangeArgs / 3 == seqTy.getDimension()) &&
-           "Expect 3 block arguments per dimension");
-    (void)nbRangeArgs;
     llvm::SmallVector<fir::DoLoopOp> loops;
     llvm::SmallVector<mlir::Value> ivs;
-    for (unsigned i = 2; i < recipe.getCombinerRegion().getArguments().size();
-         i += 3) {
-      mlir::Value lb = recipe.getCombinerRegion().getArgument(i);
-      mlir::Value ub = recipe.getCombinerRegion().getArgument(i + 1);
-      mlir::Value step = recipe.getCombinerRegion().getArgument(i + 2);
-      auto loop = builder.create<fir::DoLoopOp>(loc, lb, ub, step,
-                                                /*unordered=*/false);
-      builder.setInsertionPointToStart(loop.getBody());
-      loops.push_back(loop);
-      ivs.push_back(loop.getInductionVar());
+    if (allConstantBound) {
+      // Use the constant bound directly in the combiner region so they do not
+      // need to be passed as block argument.
+      for (auto bound : llvm::reverse(bounds)) {
+        auto dataBound =
+            mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+        llvm::SmallVector<mlir::Value> values =
+            genConstantBounds(builder, loc, dataBound);
+        auto loop =
+            builder.create<fir::DoLoopOp>(loc, values[0], values[1], values[2],
+                                          /*unordered=*/false);
+        builder.setInsertionPointToStart(loop.getBody());
+        loops.push_back(loop);
+        ivs.push_back(loop.getInductionVar());
+      }
+    } else {
+      // Lowerbound, upperbound and step are passed as block arguments.
+      [[maybe_unused]] unsigned nbRangeArgs =
+          recipe.getCombinerRegion().getArguments().size() - 2;
+      assert((nbRangeArgs / 3 == seqTy.getDimension()) &&
+             "Expect 3 block arguments per dimension");
+      for (unsigned i = 2; i < recipe.getCombinerRegion().getArguments().size();
+           i += 3) {
+        mlir::Value lb = recipe.getCombinerRegion().getArgument(i);
+        mlir::Value ub = recipe.getCombinerRegion().getArgument(i + 1);
+        mlir::Value step = recipe.getCombinerRegion().getArgument(i + 2);
+        auto loop = builder.create<fir::DoLoopOp>(loc, lb, ub, step,
+                                                  /*unordered=*/false);
+        builder.setInsertionPointToStart(loop.getBody());
+        loops.push_back(loop);
+        ivs.push_back(loop.getInductionVar());
+      }
     }
     auto addr1 = builder.create<fir::CoordinateOp>(loc, refTy, value1, ivs);
     auto addr2 = builder.create<fir::CoordinateOp>(loc, refTy, value2, ivs);
@@ -899,13 +1012,56 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
     builder.create<fir::StoreOp>(loc, res, addr1);
     builder.setInsertionPointAfter(loops[0]);
   } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
+    llvm::SmallVector<mlir::Value> tripletArgs;
+    mlir::Type innerTy = extractSequenceType(boxTy);
     fir::SequenceType seqTy =
-        mlir::dyn_cast_or_null<fir::SequenceType>(boxTy.getEleTy());
+        mlir::dyn_cast_or_null<fir::SequenceType>(innerTy);
     if (!seqTy)
       TODO(loc, "Unsupported boxed type in OpenACC reduction");
-    hlfir::Entity left = hlfir::Entity{value1};
-    hlfir::Entity right = hlfir::Entity{value2};
-    auto shape = hlfir::genShape(loc, builder, left);
+
+    if (allConstantBound) {
+      for (auto bound : llvm::reverse(bounds)) {
+        auto dataBound =
+            mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+        tripletArgs.append(genConstantBounds(builder, loc, dataBound));
+      }
+    } else {
+      assert(((recipe.getCombinerRegion().getArguments().size() - 2) / 3 ==
+              seqTy.getDimension()) &&
+             "Expect 3 block arguments per dimension");
+      for (auto arg : recipe.getCombinerRegion().getArguments().drop_front(2))
+        tripletArgs.push_back(arg);
+    }
+    auto shape = genShapeFromBounds(loc, builder, tripletArgs);
+
+    hlfir::DesignateOp::Subscripts triplets;
+    for (unsigned i = 2; i < recipe.getCombinerRegion().getArguments().size();
+         i += 3)
+      triplets.emplace_back(hlfir::DesignateOp::Triplet{
+          recipe.getCombinerRegion().getArgument(i),
+          recipe.getCombinerRegion().getArgument(i + 1),
+          recipe.getCombinerRegion().getArgument(i + 2)});
+
+    llvm::SmallVector<mlir::Value> lenParamsLeft;
+    auto leftEntity = hlfir::Entity{value1};
+    hlfir::genLengthParameters(loc, builder, leftEntity, lenParamsLeft);
+    auto leftDesignate = builder.create<hlfir::DesignateOp>(
+        loc, value1.getType(), leftEntity, /*component=*/"",
+        /*componentShape=*/mlir::Value{}, triplets,
+        /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt,
+        shape, lenParamsLeft);
+    auto left = hlfir::Entity{leftDesignate.getResult()};
+
+    llvm::SmallVector<mlir::Value> lenParamsRight;
+    auto rightEntity = hlfir::Entity{value2};
+    hlfir::genLengthParameters(loc, builder, rightEntity, lenParamsRight);
+    auto rightDesignate = builder.create<hlfir::DesignateOp>(
+        loc, value2.getType(), rightEntity, /*component=*/"",
+        /*componentShape=*/mlir::Value{}, triplets,
+        /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt,
+        shape, lenParamsRight);
+    auto right = hlfir::Entity{rightDesignate.getResult()};
+
     llvm::SmallVector<mlir::Value, 1> typeParams;
     auto genKernel = [&builder, &loc, op, seqTy, &left, &right](
                          mlir::Location l, fir::FirOpBuilder &b,
@@ -951,27 +1107,43 @@ mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe(
   // for the combiner if needed.
   llvm::SmallVector<mlir::Type> argsTy{ty, ty};
   llvm::SmallVector<mlir::Location> argsLoc{loc, loc};
-  for (mlir::Value bound : llvm::reverse(bounds)) {
-    auto dataBound =
-        mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
-    argsTy.push_back(dataBound.getLowerbound().getType());
-    argsLoc.push_back(dataBound.getLowerbound().getLoc());
-    argsTy.push_back(dataBound.getUpperbound().getType());
-    argsLoc.push_back(dataBound.getUpperbound().getLoc());
-    argsTy.push_back(dataBound.getStartIdx().getType());
-    argsLoc.push_back(dataBound.getStartIdx().getLoc());
+  bool allConstantBound = areAllBoundConstant(bounds);
+  if (!allConstantBound) {
+    for (mlir::Value bound : llvm::reverse(bounds)) {
+      auto dataBound =
+          mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
+      argsTy.push_back(dataBound.getLowerbound().getType());
+      argsLoc.push_back(dataBound.getLowerbound().getLoc());
+      argsTy.push_back(dataBound.getUpperbound().getType());
+      argsLoc.push_back(dataBound.getUpperbound().getLoc());
+      argsTy.push_back(dataBound.getStartIdx().getType());
+      argsLoc.push_back(dataBound.getStartIdx().getLoc());
+    }
   }
   builder.createBlock(&recipe.getCombinerRegion(),
                       recipe.getCombinerRegion().end(), argsTy, argsLoc);
   builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
   mlir::Value v1 = recipe.getCombinerRegion().front().getArgument(0);
   mlir::Value v2 = recipe.getCombinerRegion().front().getArgument(1);
-  genCombiner(builder, loc, op, ty, v1, v2, recipe);
+  genCombiner(builder, loc, op, ty, v1, v2, recipe, bounds, allConstantBound);
   builder.create<mlir::acc::YieldOp>(loc, v1);
   builder.restoreInsertionPoint(crtPos);
   return recipe;
 }
 
+static bool isSupportedReductionType(mlir::Type ty) {
+  ty = fir::unwrapRefType(ty);
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+    return isSupportedReductionType(boxTy.getEleTy());
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
+    return isSupportedReductionType(seqTy.getEleTy());
+  if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+    return isSupportedReductionType(heapTy.getEleTy());
+  if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(ty))
+    return isSupportedReductionType(ptrTy.getEleTy());
+  return fir::isa_trivial(ty);
+}
+
 static void
 genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
               Fortran::lower::AbstractConverter &converter,
@@ -997,10 +1169,7 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
     if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(reductionTy))
       reductionTy = seqTy.getEleTy();
 
-    if (!fir::isa_trivial(reductionTy) &&
-        ((fir::isAllocatableType(reductionTy) ||
-          fir::isPointerType(reductionTy)) &&
-         !bounds.empty()))
+    if (!isSupportedReductionType(reductionTy))
       TODO(operandLocation, "reduction with unsupported type");
 
     auto op = createDataEntryOp<mlir::acc::ReductionOp>(
@@ -1008,11 +1177,16 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
         /*structured=*/true, /*implicit=*/false,
         mlir::acc::DataClause::acc_reduction, baseAddr.getType());
     mlir::Type ty = op.getAccPtr().getType();
+    if (!areAllBoundConstant(bounds) ||
+        fir::isAssumedShape(baseAddr.getType()) ||
+        fir::isAllocatableOrPointerArray(baseAddr.getType()))
+      ty = baseAddr.getType();
+    std::string suffix =
+        areAllBoundConstant(bounds) ? getBoundsString(bounds) : "";
     std::string recipeName = fir::getTypeAsString(
         ty, converter.getKindMap(),
-        ("reduction_" + stringifyReductionOperator(mlirOp)).str());
-    if (hasDynamicShape(bounds))
-      ty = baseAddr.getType();
+        ("reduction_" + stringifyReductionOperator(mlirOp)).str() + suffix);
+
     mlir::acc::ReductionRecipeOp recipe =
         Fortran::lower::createOrGetReductionRecipe(
             builder, recipeName, operandLocation, ty, mlirOp, bounds);
@@ -2841,9 +3015,8 @@ genACC(Fortran::lower::AbstractConverter &converter,
   std::stringstream routineOpName;
   routineOpName << accRoutinePrefix.str() << routineCounter++;
   auto routineOp = modBuilder.create<mlir::acc::RoutineOp>(
-      loc, routineOpName.str(), funcName, mlir::StringAttr{}, mlir::UnitAttr{},
-      mlir::UnitAttr{}, mlir::UnitAttr{}, mlir::UnitAttr{}, mlir::UnitAttr{},
-      mlir::UnitAttr{}, mlir::IntegerAttr{});
+      loc, routineOpName.str(), funcName, mlir::StringAttr{}, false, false,
+      false, false, false, false, mlir::IntegerAttr{});
 
   for (const Fortran::parser::AccClause &clause : clauses.v) {
     if (std::get_if<Fortran::parser::AccClause::Seq>(&clause.u)) {
diff --git a/flang/lib/Optimizer/Builder/Character.cpp b/flang/lib/Optimizer/Builder/Character.cpp
index b164282df2dc5c..41cdd9a71c735d 100644
--- a/flang/lib/Optimizer/Builder/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Character.cpp
@@ -208,7 +208,9 @@ fir::factory::CharacterExprHelper::createEmbox(const fir::CharBoxValue &box) {
     builder.create<fir::StoreOp>(loc, buff, temp);
     buff = temp;
   }
-  buff = builder.createConvert(loc, refType, buff);
+  // fir.emboxchar only accepts scalar, cast array buffer to a scalar buffer.
+  if (mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrEleTy(buff.getType())))
+    buff = builder.createConvert(loc, refType, buff);
   // Convert in case the provided length is not of the integer type that must
   // be used in boxchar.
   auto len = builder.createConvert(loc, builder.getCharacterLengthType(),
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 4dde918af7fa66..a14f3106a72329 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -299,18 +299,6 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal(
   return glob;
 }
 
-fir::DispatchTableOp fir::FirOpBuilder::createDispatchTableOp(
-    mlir::Location loc, llvm::StringRef name, llvm::StringRef parentName) {
-  auto module = getModule();
-  auto insertPt = saveInsertionPoint();
-  if (auto dt = module.lookupSymbol<fir::DispatchTableOp>(name))
-    return dt;
-  setInsertionPoint(module.getBody(), module.getBody()->end());
-  auto dt = create<fir::DispatchTableOp>(loc, name, mlir::Type{}, parentName);
-  restoreInsertionPoint(insertPt);
-  return dt;
-}
-
 mlir::Value
 fir::FirOpBuilder::convertWithSemantics(mlir::Location loc, mlir::Type toTy,
                                         mlir::Value val,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index d1b7f3de93b464..5bf6b87615c685 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -1026,14 +1026,15 @@ struct ConvertOpConversion : public FIROpConversion<fir::ConvertOp> {
   }
 };
 
-/// `fir.disptach_table` operation has no specific CodeGen. The operation is
-/// only used to carry information during FIR to FIR passes.
-struct DispatchTableOpConversion
-    : public FIROpConversion<fir::DispatchTableOp> {
+/// `fir.type_info` operation has no specific CodeGen. The operation is
+/// only used to carry information during FIR to FIR passes. It may be used
+/// in the future to generate the runtime type info data structures instead
+/// of generating them in lowering.
+struct TypeInfoOpConversion : public FIROpConversion<fir::TypeInfoOp> {
   using FIROpConversion::FIROpConversion;
 
   mlir::LogicalResult
-  matchAndRewrite(fir::DispatchTableOp op, OpAdaptor,
+  matchAndRewrite(fir::TypeInfoOp op, OpAdaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     rewriter.eraseOp(op);
     return mlir::success();
@@ -3787,19 +3788,19 @@ class FIRToLLVMLowering
         BoxProcHostOpConversion, BoxRankOpConversion, BoxTypeCodeOpConversion,
         BoxTypeDescOpConversion, CallOpConversion, CmpcOpConversion,
         ConstcOpConversion, ConvertOpConversion, CoordinateOpConversion,
-        DispatchTableOpConversion, DTEntryOpConversion, DivcOpConversion,
-        EmboxOpConversion, EmboxCharOpConversion, EmboxProcOpConversion,
-        ExtractValueOpConversion, FieldIndexOpConversion, FirEndOpConversion,
-        FreeMemOpConversion, GlobalLenOpConversion, GlobalOpConversion,
-        HasValueOpConversion, InsertOnRangeOpConversion,
-        InsertValueOpConversion, IsPresentOpConversion,
-        LenParamIndexOpConversion, LoadOpConversion, MulcOpConversion,
-        NegcOpConversion, NoReassocOpConversion, SelectCaseOpConversion,
-        SelectOpConversion, SelectRankOpConversion, SelectTypeOpConversion,
-        ShapeOpConversion, ShapeShiftOpConversion, ShiftOpConversion,
-        SliceOpConversion, StoreOpConversion, StringLitOpConversion,
-        SubcOpConversion, TypeDescOpConversion, UnboxCharOpConversion,
-        UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion,
+        DTEntryOpConversion, DivcOpConversion, EmboxOpConversion,
+        EmboxCharOpConversion, EmboxProcOpConversion, ExtractValueOpConversion,
+        FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion,
+        GlobalLenOpConversion, GlobalOpConversion, HasValueOpConversion,
+        InsertOnRangeOpConversion, InsertValueOpConversion,
+        IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion,
+        MulcOpConversion, NegcOpConversion, NoReassocOpConversion,
+        SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion,
+        SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion,
+        ShiftOpConversion, SliceOpConversion, StoreOpConversion,
+        StringLitOpConversion, SubcOpConversion, TypeDescOpConversion,
+        TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion,
+        UndefOpConversion, UnreachableOpConversion,
         UnrealizedConversionCastOpConversion, XArrayCoorOpConversion,
         XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(typeConverter,
                                                                   options);
diff --git a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
index 04259c0e4c7213..1a5ae8cf7aac62 100644
--- a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
+++ b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
@@ -92,6 +92,10 @@ TBAATagAttr TBAABuilder::getDataAccessTag(Type baseFIRType, Type accessFIRType,
   return getAnyDataAccessTag();
 }
 
+TBAATagAttr TBAABuilder::getAnyAccessTag() {
+  return getAccessTag(anyAccessTypeDesc, anyAccessTypeDesc, /*offset=*/0);
+}
+
 void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
                                 Type accessFIRType, GEPOp gep) {
   if (!enableTBAA)
@@ -106,10 +110,17 @@ void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
                           << "\n");
 
   TBAATagAttr tbaaTagSym;
-  if (baseFIRType.isa<fir::BaseBoxType>())
+  if (fir::isRecordWithDescriptorMember(baseFIRType)) {
+    // A memory access that addresses an aggregate that contains
+    // a mix of data members and descriptor members may alias
+    // with both data and descriptor accesses.
+    // Conservatively set any-access tag if there is any descriptor member.
+    tbaaTagSym = getAnyAccessTag();
+  } else if (baseFIRType.isa<fir::BaseBoxType>()) {
     tbaaTagSym = getBoxAccessTag(baseFIRType, accessFIRType, gep);
-  else
+  } else {
     tbaaTagSym = getDataAccessTag(baseFIRType, accessFIRType, gep);
+  }
 
   if (!tbaaTagSym)
     return;
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 962b87acd5a805..d60e5e9657ea0f 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -1161,76 +1161,37 @@ mlir::FunctionType fir::DispatchOp::getFunctionType() {
 }
 
 //===----------------------------------------------------------------------===//
-// DispatchTableOp
+// TypeInfoOp
 //===----------------------------------------------------------------------===//
 
-mlir::ParseResult fir::DispatchTableOp::parse(mlir::OpAsmParser &parser,
-                                              mlir::OperationState &result) {
-  // Parse the name as a symbol reference attribute.
-  mlir::StringAttr nameAttr;
-  if (parser.parseSymbolName(nameAttr, mlir::SymbolTable::getSymbolAttrName(),
-                             result.attributes))
-    return mlir::failure();
-
-  if (!failed(parser.parseOptionalKeyword(getExtendsKeyword()))) {
-    mlir::StringAttr parent;
-    if (parser.parseLParen() ||
-        parser.parseAttribute(parent, getParentAttrNameStr(),
-                              result.attributes) ||
-        parser.parseRParen())
-      return mlir::failure();
-  }
-
-  // Parse the optional table body.
-  mlir::Region *body = result.addRegion();
-  mlir::OptionalParseResult parseResult = parser.parseOptionalRegion(*body);
-  if (parseResult.has_value() && failed(*parseResult))
-    return mlir::failure();
-
-  fir::DispatchTableOp::ensureTerminator(*body, parser.getBuilder(),
-                                         result.location);
-  return mlir::success();
+void fir::TypeInfoOp::build(mlir::OpBuilder &builder,
+                            mlir::OperationState &result, fir::RecordType type,
+                            fir::RecordType parentType,
+                            llvm::ArrayRef<mlir::NamedAttribute> attrs) {
+  result.addRegion();
+  result.addAttribute(mlir::SymbolTable::getSymbolAttrName(),
+                      builder.getStringAttr(type.getName()));
+  result.addAttribute(getTypeAttrName(result.name), mlir::TypeAttr::get(type));
+  if (parentType)
+    result.addAttribute(getParentTypeAttrName(result.name),
+                        mlir::TypeAttr::get(parentType));
+  result.addAttributes(attrs);
 }
 
-void fir::DispatchTableOp::print(mlir::OpAsmPrinter &p) {
-  p << ' ';
-  p.printSymbolName(getSymName());
-  if (getParent())
-    p << ' ' << getExtendsKeyword() << '('
-      << (*this)->getAttr(getParentAttrNameStr()) << ')';
+mlir::LogicalResult fir::TypeInfoOp::verify() {
+  if (!getDispatchTable().empty())
+    for (auto &op : getDispatchTable().front().without_terminator())
+      if (!mlir::isa<fir::DTEntryOp>(op))
+        return op.emitOpError("dispatch table must contain dt_entry");
 
-  mlir::Region &body = getOperation()->getRegion(0);
-  if (!body.empty()) {
-    p << ' ';
-    p.printRegion(body, /*printEntryBlockArgs=*/false,
-                  /*printBlockTerminators=*/false);
-  }
-}
+  if (!mlir::isa<fir::RecordType>(getType()))
+    return emitOpError("type must be a fir.type");
 
-mlir::LogicalResult fir::DispatchTableOp::verify() {
-  if (getRegion().empty())
-    return mlir::success();
-  for (auto &op : getBlock())
-    if (!mlir::isa<fir::DTEntryOp, fir::FirEndOp>(op))
-      return op.emitOpError("dispatch table must contain dt_entry");
+  if (getParentType() && !mlir::isa<fir::RecordType>(*getParentType()))
+    return emitOpError("parent_type must be a fir.type");
   return mlir::success();
 }
 
-void fir::DispatchTableOp::build(mlir::OpBuilder &builder,
-                                 mlir::OperationState &result,
-                                 llvm::StringRef name, mlir::Type type,
-                                 llvm::StringRef parent,
-                                 llvm::ArrayRef<mlir::NamedAttribute> attrs) {
-  result.addRegion();
-  result.addAttribute(mlir::SymbolTable::getSymbolAttrName(),
-                      builder.getStringAttr(name));
-  if (!parent.empty())
-    result.addAttribute(getParentAttrNameStr(), builder.getStringAttr(parent));
-  // result.addAttribute(getSymbolAttrNameStr(),
-  //                     mlir::SymbolRefAttr::get(builder.getContext(), name));
-  result.addAttributes(attrs);
-}
-
 //===----------------------------------------------------------------------===//
 // EmboxOp
 //===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp
index dd79aa27645452..323b589cc7f2eb 100644
--- a/flang/lib/Optimizer/Dialect/FIRType.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRType.cpp
@@ -311,6 +311,25 @@ bool isAssumedType(mlir::Type ty) {
   return false;
 }
 
+bool isAssumedShape(mlir::Type ty) {
+  if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty))
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy()))
+      return seqTy.hasDynamicExtents();
+  return false;
+}
+
+bool isAllocatableOrPointerArray(mlir::Type ty) {
+  if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
+    ty = refTy;
+  if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty)) {
+    if (auto heapTy = mlir::dyn_cast<fir::HeapType>(boxTy.getEleTy()))
+      return mlir::isa<fir::SequenceType>(heapTy.getEleTy());
+    if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(boxTy.getEleTy()))
+      return mlir::isa<fir::SequenceType>(ptrTy.getEleTy());
+  }
+  return false;
+}
+
 bool isPolymorphicType(mlir::Type ty) {
   if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
     ty = refTy;
@@ -360,6 +379,18 @@ bool isRecordWithAllocatableMember(mlir::Type ty) {
   return false;
 }
 
+bool isRecordWithDescriptorMember(mlir::Type ty) {
+  ty = unwrapSequenceType(ty);
+  if (auto recTy = ty.dyn_cast<fir::RecordType>())
+    for (auto [field, memTy] : recTy.getTypeList()) {
+      if (mlir::isa<fir::BaseBoxType>(memTy))
+        return true;
+      if (memTy.isa<fir::RecordType>() && isRecordWithDescriptorMember(memTy))
+        return true;
+    }
+  return false;
+}
+
 mlir::Type unwrapAllRefAndSeqType(mlir::Type ty) {
   while (true) {
     mlir::Type nt = unwrapSequenceType(unwrapRefType(ty));
@@ -520,7 +551,7 @@ std::string getTypeAsString(mlir::Type ty, const fir::KindMapping &kindMap,
     } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(ty)) {
       for (auto extent : seqTy.getShape()) {
         if (extent == fir::SequenceType::getUnknownExtent())
-          name << "?x";
+          name << "Ux";
         else
           name << extent << 'x';
       }
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 3ddaf1f2af8fdd..3da8666d7c53f7 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -791,26 +791,35 @@ struct ElementalOpConversion
     // Assign the element value to the temp element for this iteration.
     auto tempElement =
         hlfir::getElementAt(loc, builder, temp, loopNest.oneBasedIndices);
-    // FIXME: if the elemental result is a function result temporary
-    // of a derived type, we have to make sure that we are either
-    // deallocate any allocatable/automatic components after the assignment
-    // or that we do not do the deep copy with the AssignOp. The latter
-    // seems to be preferrable, because the deep copy is more expensive.
-    // The shallow copy may be done with a load/store of the RecordType scalar.
-    builder.create<hlfir::AssignOp>(loc, elementValue, tempElement,
-                                    /*realloc=*/false,
-                                    /*keep_lhs_length_if_realloc=*/false,
-                                    /*temporary_lhs=*/true);
-    // hlfir.yield_element implicitly marks the end-of-life its operand if
-    // it is an expression created in the hlfir.elemental (since it is its
-    // last use and an hlfir.destroy could not be created afterwards)
-    // Now that this node has been removed and the expression has been used in
-    // the assign, insert an hlfir.destroy to mark the expression end-of-life.
-    // If the expression creation allocated a buffer on the heap inside the
-    // loop, this will ensure the buffer properly deallocated.
-    if (elementValue.getType().isa<hlfir::ExprType>() &&
-        wasCreatedInCurrentBlock(elementValue, builder))
-      builder.create<hlfir::DestroyOp>(loc, elementValue);
+    // If the elemental result is a temporary of a derived type,
+    // we can avoid the deep copy implied by the AssignOp and just
+    // do the shallow copy with load/store. This helps avoiding the overhead
+    // of deallocating allocatable components of the temporary (if any)
+    // on each iteration of the elemental operation.
+    auto asExpr = elementValue.getDefiningOp<hlfir::AsExprOp>();
+    auto elemType = hlfir::getFortranElementType(elementValue.getType());
+    if (asExpr && asExpr.isMove() && mlir::isa<fir::RecordType>(elemType) &&
+        hlfir::mayHaveAllocatableComponent(elemType) &&
+        wasCreatedInCurrentBlock(elementValue, builder)) {
+      auto load = builder.create<fir::LoadOp>(loc, asExpr.getVar());
+      builder.create<fir::StoreOp>(loc, load, tempElement);
+    } else {
+      builder.create<hlfir::AssignOp>(loc, elementValue, tempElement,
+                                      /*realloc=*/false,
+                                      /*keep_lhs_length_if_realloc=*/false,
+                                      /*temporary_lhs=*/true);
+
+      // hlfir.yield_element implicitly marks the end-of-life its operand if
+      // it is an expression created in the hlfir.elemental (since it is its
+      // last use and an hlfir.destroy could not be created afterwards)
+      // Now that this node has been removed and the expression has been used in
+      // the assign, insert an hlfir.destroy to mark the expression end-of-life.
+      // If the expression creation allocated a buffer on the heap inside the
+      // loop, this will ensure the buffer properly deallocated.
+      if (elementValue.getType().isa<hlfir::ExprType>() &&
+          wasCreatedInCurrentBlock(elementValue, builder))
+        builder.create<hlfir::DestroyOp>(loc, elementValue);
+    }
     builder.restoreInsertionPoint(insPt);
 
     mlir::Value bufferizedExpr =
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 3d2b7e5eaeade0..428c4c2a1e6440 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -19,6 +19,7 @@ add_flang_library(FIRTransforms
   OMPEarlyOutlining.cpp
   OMPFunctionFiltering.cpp
   OMPMarkDeclareTarget.cpp
+  VScaleAttr.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
index f5d0602c270cc7..93efea434cb12e 100644
--- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
@@ -70,7 +70,7 @@ class SelectTypeConv : public OpConversionPattern<fir::SelectTypeOp> {
                                         mlir::PatternRewriter &rewriter,
                                         fir::KindMapping &kindMap) const;
 
-  llvm::SmallSet<llvm::StringRef, 4> collectAncestors(fir::DispatchTableOp dt,
+  llvm::SmallSet<llvm::StringRef, 4> collectAncestors(fir::TypeInfoOp dt,
                                                       mlir::ModuleOp mod) const;
 
   // Mutex used to guard insertion of mlir::func::FuncOp in the module.
@@ -305,7 +305,7 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
 
     if (auto a = typeGuards[t].dyn_cast<fir::SubclassAttr>()) {
       if (auto recTy = a.getType().dyn_cast<fir::RecordType>()) {
-        auto dt = mod.lookupSymbol<fir::DispatchTableOp>(recTy.getName());
+        auto dt = mod.lookupSymbol<fir::TypeInfoOp>(recTy.getName());
         assert(dt && "dispatch table not found");
         llvm::SmallSet<llvm::StringRef, 4> ancestors =
             collectAncestors(dt, mod);
@@ -462,14 +462,12 @@ SelectTypeConv::genTypeDescCompare(mlir::Location loc, mlir::Value selector,
 }
 
 llvm::SmallSet<llvm::StringRef, 4>
-SelectTypeConv::collectAncestors(fir::DispatchTableOp dt,
-                                 mlir::ModuleOp mod) const {
+SelectTypeConv::collectAncestors(fir::TypeInfoOp dt, mlir::ModuleOp mod) const {
   llvm::SmallSet<llvm::StringRef, 4> ancestors;
-  if (!dt.getParent().has_value())
-    return ancestors;
-  while (dt.getParent().has_value()) {
-    ancestors.insert(*dt.getParent());
-    dt = mod.lookupSymbol<fir::DispatchTableOp>(*dt.getParent());
+  while (auto parentName = dt.getIfParentName()) {
+    ancestors.insert(*parentName);
+    dt = mod.lookupSymbol<fir::TypeInfoOp>(*parentName);
+    assert(dt && "parent type info not generated");
   }
   return ancestors;
 }
diff --git a/flang/lib/Optimizer/Transforms/VScaleAttr.cpp b/flang/lib/Optimizer/Transforms/VScaleAttr.cpp
new file mode 100644
index 00000000000000..601a937de37be9
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/VScaleAttr.cpp
@@ -0,0 +1,90 @@
+//===- VScaleAttr.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass adds a `vscale_range` attribute to function definitions.
+/// The attribute is used for scalable vector operations on Arm processors
+/// and should only be run on processors that support this feature. [It is
+/// likely harmless to run it on something else, but it is also not valuable].
+//===----------------------------------------------------------------------===//
+
+#include "flang/ISO_Fortran_binding_wrapper.h"
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/Inquiry.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/Dialect/Support/KindMapping.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+
+namespace fir {
+#define GEN_PASS_DECL_VSCALEATTR
+#define GEN_PASS_DEF_VSCALEATTR
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "vscale-attr"
+
+namespace {
+
+class VScaleAttrPass : public fir::impl::VScaleAttrBase<VScaleAttrPass> {
+public:
+  VScaleAttrPass(const fir::VScaleAttrOptions &options) {
+    vscaleRange = options.vscaleRange;
+  }
+  VScaleAttrPass() {}
+  void runOnOperation() override;
+};
+
+} // namespace
+
+void VScaleAttrPass::runOnOperation() {
+  LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
+  mlir::func::FuncOp func = getOperation();
+
+  LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
+
+  auto context = &getContext();
+
+  auto intTy = mlir::IntegerType::get(context, 32);
+
+  assert(vscaleRange.first && "VScaleRange minimum should be non-zero");
+
+  func->setAttr("vscale_range",
+                mlir::LLVM::VScaleRangeAttr::get(
+                    context, mlir::IntegerAttr::get(intTy, vscaleRange.first),
+                    mlir::IntegerAttr::get(intTy, vscaleRange.second)));
+
+  LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
+}
+
+std::unique_ptr<mlir::Pass>
+fir::createVScaleAttrPass(std::pair<unsigned, unsigned> vscaleAttr) {
+  VScaleAttrOptions opts;
+  opts.vscaleRange = vscaleAttr;
+  return std::make_unique<VScaleAttrPass>(opts);
+}
+
+std::unique_ptr<mlir::Pass> fir::createVScaleAttrPass() {
+  return std::make_unique<VScaleAttrPass>();
+}
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 7c5574541dc2a8..b642cb7678c679 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1734,7 +1734,7 @@ bool OmpStructureChecker::IsOperatorValid(const T &node, const D &variable) {
     }
     return common::HasMember<T, AllowedBinaryOperators>;
   }
-  return true;
+  return false;
 }
 
 void OmpStructureChecker::CheckAtomicCaptureStmt(
@@ -1780,9 +1780,12 @@ void OmpStructureChecker::CheckAtomicUpdateStmt(
     const parser::AssignmentStmt &assignment) {
   const auto &expr{std::get<parser::Expr>(assignment.t)};
   const auto &var{std::get<parser::Variable>(assignment.t)};
+  bool isIntrinsicProcedure{false};
+  bool isValidOperator{false};
   common::visit(
       common::visitors{
           [&](const common::Indirection<parser::FunctionReference> &x) {
+            isIntrinsicProcedure = true;
             const auto &procedureDesignator{
                 std::get<parser::ProcedureDesignator>(x.value().v.t)};
             const parser::Name *name{
@@ -1794,46 +1797,61 @@ void OmpStructureChecker::CheckAtomicUpdateStmt(
               context_.Say(expr.source,
                   "Invalid intrinsic procedure name in "
                   "OpenMP ATOMIC (UPDATE) statement"_err_en_US);
-            } else if (name) {
-              bool foundMatch{false};
-              if (auto varDesignatorIndirection =
-                      std::get_if<Fortran::common::Indirection<
-                          Fortran::parser::Designator>>(&var.u)) {
-                const auto &varDesignator = varDesignatorIndirection->value();
-                if (const auto *dataRef = std::get_if<Fortran::parser::DataRef>(
-                        &varDesignator.u)) {
-                  if (const auto *name =
-                          std::get_if<Fortran::parser::Name>(&dataRef->u)) {
-                    const auto &varSymbol = *name->symbol;
-                    if (const auto *e{GetExpr(context_, expr)}) {
-                      for (const Symbol &symbol :
-                          evaluate::CollectSymbols(*e)) {
-                        if (symbol == varSymbol) {
-                          foundMatch = true;
-                          break;
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-              if (!foundMatch) {
-                context_.Say(expr.source,
-                    "Atomic update variable '%s' not found in the "
-                    "argument list of intrinsic procedure"_err_en_US,
-                    var.GetSource().ToString());
-              }
             }
           },
           [&](const auto &x) {
             if (!IsOperatorValid(x, var)) {
               context_.Say(expr.source,
-                  "Invalid operator in OpenMP ATOMIC (UPDATE) "
+                  "Invalid or missing operator in atomic update "
                   "statement"_err_en_US);
-            }
+            } else
+              isValidOperator = true;
           },
       },
       expr.u);
+  if (const auto *e{GetExpr(context_, expr)}) {
+    const auto *v{GetExpr(context_, var)};
+    if (e->Rank() != 0)
+      context_.Say(expr.source,
+          "Expected scalar expression "
+          "on the RHS of atomic update assignment "
+          "statement"_err_en_US);
+    if (v->Rank() != 0)
+      context_.Say(var.GetSource(),
+          "Expected scalar variable "
+          "on the LHS of atomic update assignment "
+          "statement"_err_en_US);
+    const Symbol &varSymbol = evaluate::GetSymbolVector(*v).front();
+    int numOfSymbolMatches{0};
+    SymbolVector exprSymbols = evaluate::GetSymbolVector(*e);
+    for (const Symbol &symbol : exprSymbols) {
+      if (varSymbol == symbol)
+        numOfSymbolMatches++;
+    }
+    if (isIntrinsicProcedure) {
+      std::string varName = var.GetSource().ToString();
+      if (numOfSymbolMatches != 1)
+        context_.Say(expr.source,
+            "Intrinsic procedure"
+            " arguments in atomic update statement"
+            " must have exactly one occurence of '%s'"_err_en_US,
+            varName);
+      else if (varSymbol != exprSymbols.front() &&
+          varSymbol != exprSymbols.back())
+        context_.Say(expr.source,
+            "Atomic update statement "
+            "should be of the form `%s = intrinsic_procedure(%s, expr_list)` "
+            "OR `%s = intrinsic_procedure(expr_list, %s)`"_err_en_US,
+            varName, varName, varName, varName);
+    } else if (isValidOperator) {
+      if (numOfSymbolMatches != 1)
+        context_.Say(expr.source,
+            "Exactly one occurence of '%s' "
+            "expected on the RHS of atomic update assignment statement"_err_en_US,
+            var.GetSource().ToString());
+    }
+  }
+
   ErrIfAllocatableVariable(var);
 }
 
@@ -2148,6 +2166,7 @@ CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare)
 CHECK_SIMPLE_CLAUSE(CancellationConstructType, OMPC_cancellation_construct_type)
 CHECK_SIMPLE_CLAUSE(Doacross, OMPC_doacross)
 CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute)
+CHECK_SIMPLE_CLAUSE(OmpxBare, OMPC_ompx_bare)
 
 CHECK_REQ_SCALAR_INT_CLAUSE(Grainsize, OMPC_grainsize)
 CHECK_REQ_SCALAR_INT_CLAUSE(NumTasks, OMPC_num_tasks)
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 25ffcb68eaf87a..7d6ab2c83cc595 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -841,6 +841,11 @@ bool MayRequireFinalization(const DerivedTypeSpec &derived) {
       FindPolymorphicAllocatableUltimateComponent(derived);
 }
 
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) {
+  DirectComponentIterator directs{derived};
+  return std::any_of(directs.begin(), directs.end(), IsAllocatable);
+}
+
 bool IsAssumedLengthCharacter(const Symbol &symbol) {
   if (const DeclTypeSpec * type{symbol.GetType()}) {
     return type->category() == DeclTypeSpec::Character &&
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 51f1e16be778b8..f75daa373705f0 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -6,14 +6,10 @@
 #
 #===------------------------------------------------------------------------===#
 
-# TODO: Maybe this file should still be added by flang/CMakeLists.txt and
-# when FLANG_STANDALONE_BUILD=On and a new variable FLANG_BUILD_RUNTIME=On
-# we should invoke an ExternalProject_Add(flang-rt ...) from here?
-
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   cmake_minimum_required(VERSION 3.20.0)
 
-  project(FortranRuntime C CXX)
+  project(FlangRuntime C CXX)
 
   set(CMAKE_CXX_STANDARD 17)
   set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
@@ -86,6 +82,8 @@ append(${NO_LTO_FLAGS} CMAKE_CXX_FLAGS)
 add_definitions(-U_GLIBCXX_ASSERTIONS)
 add_definitions(-U_LIBCPP_ENABLE_ASSERTIONS)
 
+add_subdirectory(FortranMain)
+
 set(sources
   ISO_Fortran_binding.cpp
   allocatable.cpp
@@ -152,10 +150,16 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
 
 # List of files that are buildable for all devices.
 set(supported_files
+  ISO_Fortran_binding.cpp
+  assign.cpp
+  derived.cpp
   descriptor.cpp
+  stat.cpp
   terminator.cpp
+  tools.cpp
   transformational.cpp
   type-code.cpp
+  type-info.cpp
   )
 
 if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
@@ -270,13 +274,10 @@ if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
   endif()
 endif()
 
-add_compile_options(-fPIC)
-
-add_flang_library(FortranRuntime STATIC
+add_flang_library(FortranRuntime
   ${sources}
   LINK_LIBS
-  FortranDecimalRT
+  FortranDecimal
 
   INSTALL_WITH_TOOLCHAIN
-  PIC
 )
diff --git a/flang/runtime/ISO_Fortran_binding.cpp b/flang/runtime/ISO_Fortran_binding.cpp
index 45b4d0ae3f569a..15743be88d1beb 100644
--- a/flang/runtime/ISO_Fortran_binding.cpp
+++ b/flang/runtime/ISO_Fortran_binding.cpp
@@ -19,7 +19,9 @@
 namespace Fortran::ISO {
 extern "C" {
 
-void *CFI_address(
+RT_EXT_API_GROUP_BEGIN
+
+RT_API_ATTRS void *CFI_address(
     const CFI_cdesc_t *descriptor, const CFI_index_t subscripts[]) {
   char *p{static_cast<char *>(descriptor->base_addr)};
   const CFI_rank_t rank{descriptor->rank};
@@ -30,8 +32,9 @@ void *CFI_address(
   return p;
 }
 
-int CFI_allocate(CFI_cdesc_t *descriptor, const CFI_index_t lower_bounds[],
-    const CFI_index_t upper_bounds[], std::size_t elem_len) {
+RT_API_ATTRS int CFI_allocate(CFI_cdesc_t *descriptor,
+    const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[],
+    std::size_t elem_len) {
   if (!descriptor) {
     return CFI_INVALID_DESCRIPTOR;
   }
@@ -81,7 +84,7 @@ int CFI_allocate(CFI_cdesc_t *descriptor, const CFI_index_t lower_bounds[],
   return CFI_SUCCESS;
 }
 
-int CFI_deallocate(CFI_cdesc_t *descriptor) {
+RT_API_ATTRS int CFI_deallocate(CFI_cdesc_t *descriptor) {
   if (!descriptor) {
     return CFI_INVALID_DESCRIPTOR;
   }
@@ -101,7 +104,7 @@ int CFI_deallocate(CFI_cdesc_t *descriptor) {
   return CFI_SUCCESS;
 }
 
-int CFI_establish(CFI_cdesc_t *descriptor, void *base_addr,
+RT_API_ATTRS int CFI_establish(CFI_cdesc_t *descriptor, void *base_addr,
     CFI_attribute_t attribute, CFI_type_t type, std::size_t elem_len,
     CFI_rank_t rank, const CFI_index_t extents[]) {
   int cfiStatus{VerifyEstablishParameters(descriptor, base_addr, attribute,
@@ -121,7 +124,7 @@ int CFI_establish(CFI_cdesc_t *descriptor, void *base_addr,
   return CFI_SUCCESS;
 }
 
-int CFI_is_contiguous(const CFI_cdesc_t *descriptor) {
+RT_API_ATTRS int CFI_is_contiguous(const CFI_cdesc_t *descriptor) {
   CFI_index_t bytes = descriptor->elem_len;
   for (int j{0}; j < descriptor->rank; ++j) {
     if (bytes != descriptor->dim[j].sm) {
@@ -132,7 +135,7 @@ int CFI_is_contiguous(const CFI_cdesc_t *descriptor) {
   return 1;
 }
 
-int CFI_section(CFI_cdesc_t *result, const CFI_cdesc_t *source,
+RT_API_ATTRS int CFI_section(CFI_cdesc_t *result, const CFI_cdesc_t *source,
     const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[],
     const CFI_index_t strides[]) {
   CFI_index_t extent[CFI_MAX_RANK];
@@ -208,7 +211,7 @@ int CFI_section(CFI_cdesc_t *result, const CFI_cdesc_t *source,
   return CFI_SUCCESS;
 }
 
-int CFI_select_part(CFI_cdesc_t *result, const CFI_cdesc_t *source,
+RT_API_ATTRS int CFI_select_part(CFI_cdesc_t *result, const CFI_cdesc_t *source,
     std::size_t displacement, std::size_t elem_len) {
   if (!result || !source) {
     return CFI_INVALID_DESCRIPTOR;
@@ -243,7 +246,7 @@ int CFI_select_part(CFI_cdesc_t *result, const CFI_cdesc_t *source,
   return CFI_SUCCESS;
 }
 
-int CFI_setpointer(CFI_cdesc_t *result, const CFI_cdesc_t *source,
+RT_API_ATTRS int CFI_setpointer(CFI_cdesc_t *result, const CFI_cdesc_t *source,
     const CFI_index_t lower_bounds[]) {
   if (!result) {
     return CFI_INVALID_DESCRIPTOR;
@@ -285,5 +288,7 @@ int CFI_setpointer(CFI_cdesc_t *result, const CFI_cdesc_t *source,
   }
   return CFI_SUCCESS;
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::ISO
diff --git a/flang/runtime/assign-impl.h b/flang/runtime/assign-impl.h
index 0cc3aab432fc25..f07a501d1d1263 100644
--- a/flang/runtime/assign-impl.h
+++ b/flang/runtime/assign-impl.h
@@ -17,7 +17,8 @@ class Terminator;
 // Note that if allocate object and source expression have the same rank, the
 // value of the allocate object becomes the value provided; otherwise the value
 // of each element of allocate object becomes the value provided (9.7.1.2(7)).
-void DoFromSourceAssign(Descriptor &, const Descriptor &, Terminator &);
+RT_API_ATTRS void DoFromSourceAssign(
+    Descriptor &, const Descriptor &, Terminator &);
 
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_ASSIGN_IMPL_H_
diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp
index 3a7ade9421ccf2..458a1ba006b690 100644
--- a/flang/runtime/assign.cpp
+++ b/flang/runtime/assign.cpp
@@ -30,7 +30,7 @@ enum AssignFlags {
 
 // Predicate: is the left-hand side of an assignment an allocated allocatable
 // that must be deallocated?
-static inline bool MustDeallocateLHS(
+static inline RT_API_ATTRS bool MustDeallocateLHS(
     Descriptor &to, const Descriptor &from, Terminator &terminator, int flags) {
   // Top-level assignments to allocatable variables (*not* components)
   // may first deallocate existing content if there's about to be a
@@ -83,7 +83,7 @@ static inline bool MustDeallocateLHS(
 
 // Utility: allocate the allocatable left-hand side, either because it was
 // originally deallocated or because it required reallocation
-static int AllocateAssignmentLHS(
+static RT_API_ATTRS int AllocateAssignmentLHS(
     Descriptor &to, const Descriptor &from, Terminator &terminator, int flags) {
   to.raw().type = from.raw().type;
   if (!(flags & ExplicitLengthCharacterLHS)) {
@@ -118,7 +118,7 @@ static int AllocateAssignmentLHS(
 }
 
 // least <= 0, most >= 0
-static void MaximalByteOffsetRange(
+static RT_API_ATTRS void MaximalByteOffsetRange(
     const Descriptor &desc, std::int64_t &least, std::int64_t &most) {
   least = most = 0;
   if (desc.ElementBytes() == 0) {
@@ -140,15 +140,15 @@ static void MaximalByteOffsetRange(
   most += desc.ElementBytes() - 1;
 }
 
-static inline bool RangesOverlap(const char *aStart, const char *aEnd,
-    const char *bStart, const char *bEnd) {
+static inline RT_API_ATTRS bool RangesOverlap(const char *aStart,
+    const char *aEnd, const char *bStart, const char *bEnd) {
   return aEnd >= bStart && bEnd >= aStart;
 }
 
 // Predicate: could the left-hand and right-hand sides of the assignment
 // possibly overlap in memory?  Note that the descriptors themeselves
 // are included in the test.
-static bool MayAlias(const Descriptor &x, const Descriptor &y) {
+static RT_API_ATTRS bool MayAlias(const Descriptor &x, const Descriptor &y) {
   const char *xBase{x.OffsetElement()};
   const char *yBase{y.OffsetElement()};
   if (!xBase || !yBase) {
@@ -176,7 +176,7 @@ static bool MayAlias(const Descriptor &x, const Descriptor &y) {
   return true;
 }
 
-static void DoScalarDefinedAssignment(const Descriptor &to,
+static RT_API_ATTRS void DoScalarDefinedAssignment(const Descriptor &to,
     const Descriptor &from, const typeInfo::SpecialBinding &special) {
   bool toIsDesc{special.IsArgDescriptor(0)};
   bool fromIsDesc{special.IsArgDescriptor(1)};
@@ -200,7 +200,7 @@ static void DoScalarDefinedAssignment(const Descriptor &to,
   }
 }
 
-static void DoElementalDefinedAssignment(const Descriptor &to,
+static RT_API_ATTRS void DoElementalDefinedAssignment(const Descriptor &to,
     const Descriptor &from, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special) {
   SubscriptValue toAt[maxRank], fromAt[maxRank];
@@ -221,15 +221,16 @@ static void DoElementalDefinedAssignment(const Descriptor &to,
 }
 
 template <typename CHAR>
-static void BlankPadCharacterAssignment(Descriptor &to, const Descriptor &from,
-    SubscriptValue toAt[], SubscriptValue fromAt[], std::size_t elements,
-    std::size_t toElementBytes, std::size_t fromElementBytes) {
+static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
+    const Descriptor &from, SubscriptValue toAt[], SubscriptValue fromAt[],
+    std::size_t elements, std::size_t toElementBytes,
+    std::size_t fromElementBytes) {
   std::size_t padding{(toElementBytes - fromElementBytes) / sizeof(CHAR)};
   std::size_t copiedCharacters{fromElementBytes / sizeof(CHAR)};
   for (; elements-- > 0;
        to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
     CHAR *p{to.Element<CHAR>(toAt)};
-    std::memmove(
+    Fortran::runtime::memmove(
         p, from.Element<std::add_const_t<CHAR>>(fromAt), fromElementBytes);
     p += copiedCharacters;
     for (auto n{padding}; n-- > 0;) {
@@ -249,7 +250,7 @@ static void BlankPadCharacterAssignment(Descriptor &to, const Descriptor &from,
 // of elements, but their shape need not to conform (the assignment is done in
 // element sequence order). This facilitates some internal usages, like when
 // dealing with array constructors.
-static void Assign(
+RT_API_ATTRS static void Assign(
     Descriptor &to, const Descriptor &from, Terminator &terminator, int flags) {
   bool mustDeallocateLHS{(flags & DeallocateLHS) ||
       MustDeallocateLHS(to, from, terminator, flags)};
@@ -418,7 +419,7 @@ static void Assign(
           std::size_t componentByteSize{comp.SizeInBytes(to)};
           for (std::size_t j{0}; j < toElements; ++j,
                to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-            std::memmove(to.Element<char>(toAt) + comp.offset(),
+            Fortran::runtime::memmove(to.Element<char>(toAt) + comp.offset(),
                 from.Element<const char>(fromAt) + comp.offset(),
                 componentByteSize);
           }
@@ -428,7 +429,7 @@ static void Assign(
         std::size_t componentByteSize{comp.SizeInBytes(to)};
         for (std::size_t j{0}; j < toElements; ++j,
              to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-          std::memmove(to.Element<char>(toAt) + comp.offset(),
+          Fortran::runtime::memmove(to.Element<char>(toAt) + comp.offset(),
               from.Element<const char>(fromAt) + comp.offset(),
               componentByteSize);
         }
@@ -479,14 +480,14 @@ static void Assign(
           *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
       for (std::size_t j{0}; j < toElements; ++j, to.IncrementSubscripts(toAt),
            from.IncrementSubscripts(fromAt)) {
-        std::memmove(to.Element<char>(toAt) + procPtr.offset,
+        Fortran::runtime::memmove(to.Element<char>(toAt) + procPtr.offset,
             from.Element<const char>(fromAt) + procPtr.offset,
             sizeof(typeInfo::ProcedurePointer));
       }
     }
   } else { // intrinsic type, intrinsic assignment
     if (isSimpleMemmove()) {
-      std::memmove(to.raw().base_addr, from.raw().base_addr,
+      Fortran::runtime::memmove(to.raw().base_addr, from.raw().base_addr,
           toElements * toElementBytes);
     } else if (toElementBytes > fromElementBytes) { // blank padding
       switch (to.type().raw()) {
@@ -510,8 +511,8 @@ static void Assign(
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
            to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-        std::memmove(to.Element<char>(toAt), from.Element<const char>(fromAt),
-            toElementBytes);
+        Fortran::runtime::memmove(to.Element<char>(toAt),
+            from.Element<const char>(fromAt), toElementBytes);
       }
     }
   }
@@ -523,7 +524,9 @@ static void Assign(
   }
 }
 
-void DoFromSourceAssign(
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS void DoFromSourceAssign(
     Descriptor &alloc, const Descriptor &source, Terminator &terminator) {
   if (alloc.rank() > 0 && source.rank() == 0) {
     // The value of each element of allocate object becomes the value of source.
@@ -542,8 +545,8 @@ void DoFromSourceAssign(
     } else { // intrinsic type
       for (std::size_t n{alloc.Elements()}; n-- > 0;
            alloc.IncrementSubscripts(allocAt)) {
-        std::memmove(alloc.Element<char>(allocAt), source.raw().base_addr,
-            alloc.ElementBytes());
+        Fortran::runtime::memmove(alloc.Element<char>(allocAt),
+            source.raw().base_addr, alloc.ElementBytes());
       }
     }
   } else {
@@ -551,8 +554,12 @@ void DoFromSourceAssign(
   }
 }
 
+RT_OFFLOAD_API_GROUP_END
+
 extern "C" {
-void RTNAME(Assign)(Descriptor &to, const Descriptor &from,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(Assign)(Descriptor &to, const Descriptor &from,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   // All top-level defined assignments can be recognized in semantics and
@@ -562,7 +569,7 @@ void RTNAME(Assign)(Descriptor &to, const Descriptor &from,
       MaybeReallocate | NeedFinalization | ComponentCanBeDefinedAssignment);
 }
 
-void RTNAME(AssignTemporary)(Descriptor &to, const Descriptor &from,
+void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   // Initialize the "to" if it is of derived type that needs initialization.
@@ -591,7 +598,7 @@ void RTNAME(AssignTemporary)(Descriptor &to, const Descriptor &from,
   Assign(to, from, terminator, PolymorphicLHS);
 }
 
-void RTNAME(CopyOutAssign)(Descriptor &to, const Descriptor &from,
+void RTDEF(CopyOutAssign)(Descriptor &to, const Descriptor &from,
     bool skipToInit, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   // Initialize the "to" if it is of derived type that needs initialization.
@@ -613,7 +620,7 @@ void RTNAME(CopyOutAssign)(Descriptor &to, const Descriptor &from,
   Assign(to, from, terminator, NoAssignFlags);
 }
 
-void RTNAME(AssignExplicitLengthCharacter)(Descriptor &to,
+void RTDEF(AssignExplicitLengthCharacter)(Descriptor &to,
     const Descriptor &from, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   Assign(to, from, terminator,
@@ -621,12 +628,14 @@ void RTNAME(AssignExplicitLengthCharacter)(Descriptor &to,
           ExplicitLengthCharacterLHS);
 }
 
-void RTNAME(AssignPolymorphic)(Descriptor &to, const Descriptor &from,
+void RTDEF(AssignPolymorphic)(Descriptor &to, const Descriptor &from,
     const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   Assign(to, from, terminator,
       MaybeReallocate | NeedFinalization | ComponentCanBeDefinedAssignment |
           PolymorphicLHS);
 }
+
+RT_EXT_API_GROUP_END
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/derived.cpp b/flang/runtime/derived.cpp
index 6e87e010df2eda..8a0d0ab2bb7836 100644
--- a/flang/runtime/derived.cpp
+++ b/flang/runtime/derived.cpp
@@ -15,8 +15,11 @@
 
 namespace Fortran::runtime {
 
-int Initialize(const Descriptor &instance, const typeInfo::DerivedType &derived,
-    Terminator &terminator, bool hasStat, const Descriptor *errMsg) {
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS int Initialize(const Descriptor &instance,
+    const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat,
+    const Descriptor *errMsg) {
   const Descriptor &componentDesc{derived.component()};
   std::size_t elements{instance.Elements()};
   int stat{StatOk};
@@ -114,7 +117,7 @@ int Initialize(const Descriptor &instance, const typeInfo::DerivedType &derived,
   return stat;
 }
 
-static const typeInfo::SpecialBinding *FindFinal(
+static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
     const typeInfo::DerivedType &derived, int rank) {
   if (const auto *ranked{derived.FindSpecialBinding(
           typeInfo::SpecialBinding::RankFinal(rank))}) {
@@ -128,7 +131,7 @@ static const typeInfo::SpecialBinding *FindFinal(
   }
 }
 
-static void CallFinalSubroutine(const Descriptor &descriptor,
+static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
   if (const auto *special{FindFinal(derived, descriptor.rank())}) {
     if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) {
@@ -193,7 +196,7 @@ static void CallFinalSubroutine(const Descriptor &descriptor,
 }
 
 // Fortran 2018 subclause 7.5.6.2
-void Finalize(const Descriptor &descriptor,
+RT_API_ATTRS void Finalize(const Descriptor &descriptor,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
   if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) {
     return;
@@ -285,7 +288,7 @@ void Finalize(const Descriptor &descriptor,
 // elementwise finalization of non-parent components taking place
 // before parent component finalization, and with all finalization
 // preceding any deallocation.
-void Destroy(const Descriptor &descriptor, bool finalize,
+RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
   if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) {
     return;
@@ -313,7 +316,7 @@ void Destroy(const Descriptor &descriptor, bool finalize,
   }
 }
 
-bool HasDynamicComponent(const Descriptor &descriptor) {
+RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) {
   if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
     if (const auto *derived = addendum->derivedType()) {
       const Descriptor &componentDesc{derived->component()};
@@ -331,4 +334,5 @@ bool HasDynamicComponent(const Descriptor &descriptor) {
   return false;
 }
 
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 6acfb8a532d30d..28248f76e882af 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -10,7 +10,9 @@
 #define FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
 
 #include "flang/Runtime/api-attrs.h"
+#include "flang/Runtime/c-or-cpp.h"
 #include <algorithm>
+#include <cstring>
 
 // The file defines a set of utilities/classes that might be
 // used to get reduce the dependency on external libraries (e.g. libstdc++).
@@ -20,6 +22,21 @@
 #define STD_FILL_N_UNSUPPORTED 1
 #endif
 
+#if !defined(STD_MEMMOVE_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_MEMMOVE_UNSUPPORTED 1
+#endif
+
+#if !defined(STD_STRLEN_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_STRLEN_UNSUPPORTED 1
+#endif
+
+#if !defined(STD_MEMCMP_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_MEMCMP_UNSUPPORTED 1
+#endif
+
 namespace Fortran::runtime {
 
 #if STD_FILL_N_UNSUPPORTED
@@ -28,16 +45,78 @@ namespace Fortran::runtime {
 template <typename A>
 static inline RT_API_ATTRS void fill_n(
     A *start, std::size_t count, const A &value) {
-#if STD_FILL_N_UNSUPPORTED
-  for (std::size_t j{0}; j < count; ++j)
+  for (std::size_t j{0}; j < count; ++j) {
     start[j] = value;
-#else
-  std::fill_n(start, count, value);
-#endif
+  }
 }
 #else // !STD_FILL_N_UNSUPPORTED
 using std::fill_n;
 #endif // !STD_FILL_N_UNSUPPORTED
 
+#if STD_MEMMOVE_UNSUPPORTED
+// Provides alternative implementation for std::memmove(), if
+// it is not supported.
+static inline RT_API_ATTRS void memmove(
+    void *dest, const void *src, std::size_t count) {
+  char *to{reinterpret_cast<char *>(dest)};
+  const char *from{reinterpret_cast<const char *>(src)};
+
+  if (to == from) {
+    return;
+  }
+  if (to + count <= from || from + count <= to) {
+    std::memcpy(dest, src, count);
+  } else if (to < from) {
+    while (count--) {
+      *to++ = *from++;
+    }
+  } else {
+    to += count;
+    from += count;
+    while (count--) {
+      *--to = *--from;
+    }
+  }
+}
+#else // !STD_MEMMOVE_UNSUPPORTED
+using std::memmove;
+#endif // !STD_MEMMOVE_UNSUPPORTED
+
+#if STD_STRLEN_UNSUPPORTED
+// Provides alternative implementation for std::strlen(), if
+// it is not supported.
+static inline RT_API_ATTRS std::size_t strlen(const char *str) {
+  if (!str) {
+    // Return 0 for nullptr.
+    return 0;
+  }
+  const char *end = str;
+  for (; *end != '\0'; ++end)
+    ;
+  return end - str;
+}
+#else // !STD_STRLEN_UNSUPPORTED
+using std::strlen;
+#endif // !STD_STRLEN_UNSUPPORTED
+
+#if STD_MEMCMP_UNSUPPORTED
+// Provides alternative implementation for std::memcmp(), if
+// it is not supported.
+static inline RT_API_ATTRS int memcmp(
+    const void *RESTRICT lhs, const void *RESTRICT rhs, std::size_t count) {
+  auto m1{reinterpret_cast<const unsigned char *>(lhs)};
+  auto m2{reinterpret_cast<const unsigned char *>(rhs)};
+  for (; count--; ++m1, ++m2) {
+    int diff = *m1 - *m2;
+    if (diff != 0) {
+      return diff;
+    }
+  }
+  return 0;
+}
+#else // !STD_MEMCMP_UNSUPPORTED
+using std::memcmp;
+#endif // !STD_MEMCMP_UNSUPPORTED
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/stat.cpp b/flang/runtime/stat.cpp
index 63284bbea7f231..24368fa6a1ae1a 100644
--- a/flang/runtime/stat.cpp
+++ b/flang/runtime/stat.cpp
@@ -8,10 +8,13 @@
 
 #include "stat.h"
 #include "terminator.h"
+#include "tools.h"
 #include "flang/Runtime/descriptor.h"
 
 namespace Fortran::runtime {
-const char *StatErrorString(int stat) {
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS const char *StatErrorString(int stat) {
   switch (stat) {
   case StatOk:
     return "No error";
@@ -68,14 +71,14 @@ const char *StatErrorString(int stat) {
   }
 }
 
-int ToErrmsg(const Descriptor *errmsg, int stat) {
+RT_API_ATTRS int ToErrmsg(const Descriptor *errmsg, int stat) {
   if (stat != StatOk && errmsg && errmsg->raw().base_addr &&
       errmsg->type() == TypeCode(TypeCategory::Character, 1) &&
       errmsg->rank() == 0) {
     if (const char *msg{StatErrorString(stat)}) {
       char *buffer{errmsg->OffsetElement()};
       std::size_t bufferLength{errmsg->ElementBytes()};
-      std::size_t msgLength{std::strlen(msg)};
+      std::size_t msgLength{Fortran::runtime::strlen(msg)};
       if (msgLength >= bufferLength) {
         std::memcpy(buffer, msg, bufferLength);
       } else {
@@ -87,7 +90,7 @@ int ToErrmsg(const Descriptor *errmsg, int stat) {
   return stat;
 }
 
-int ReturnError(
+RT_API_ATTRS int ReturnError(
     Terminator &terminator, int stat, const Descriptor *errmsg, bool hasStat) {
   if (stat == StatOk || hasStat) {
     return ToErrmsg(errmsg, stat);
@@ -98,4 +101,6 @@ int ReturnError(
   }
   return stat;
 }
+
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/stat.h b/flang/runtime/stat.h
index 7ba797c3741869..e2b0658b122c97 100644
--- a/flang/runtime/stat.h
+++ b/flang/runtime/stat.h
@@ -12,6 +12,7 @@
 #ifndef FORTRAN_RUNTIME_STAT_H_
 #define FORTRAN_RUNTIME_STAT_H_
 #include "flang/ISO_Fortran_binding_wrapper.h"
+#include "flang/Runtime/api-attrs.h"
 #include "flang/Runtime/magic-numbers.h"
 namespace Fortran::runtime {
 
@@ -52,9 +53,9 @@ enum Stat {
       FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE,
 };
 
-const char *StatErrorString(int);
-int ToErrmsg(const Descriptor *errmsg, int stat); // returns stat
-int ReturnError(Terminator &, int stat, const Descriptor *errmsg = nullptr,
-    bool hasStat = false);
+RT_API_ATTRS const char *StatErrorString(int);
+RT_API_ATTRS int ToErrmsg(const Descriptor *errmsg, int stat); // returns stat
+RT_API_ATTRS int ReturnError(Terminator &, int stat,
+    const Descriptor *errmsg = nullptr, bool hasStat = false);
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_STAT_H
diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp
index 6ca6620c16cde0..c3c14829638462 100644
--- a/flang/runtime/sum.cpp
+++ b/flang/runtime/sum.cpp
@@ -138,15 +138,10 @@ CppTypeFor<TypeCategory::Real, 10> RTNAME(SumReal10)(const Descriptor &x,
 }
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-#if HAS_FLOAT128
-using AccumType = __float128;
-#else // if LDBL_MANT_DIG == 113
-using AccumType = long double;
-#endif
 CppTypeFor<TypeCategory::Real, 16> RTNAME(SumReal16)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 16>(
-      x, source, line, dim, mask, RealSumAccumulator<AccumType>{x}, "SUM");
+      x, source, line, dim, mask, RealSumAccumulator<long double>{x}, "SUM");
 }
 #endif
 
diff --git a/flang/runtime/tools.cpp b/flang/runtime/tools.cpp
index 36cfa456a08234..a027559d9f4a74 100644
--- a/flang/runtime/tools.cpp
+++ b/flang/runtime/tools.cpp
@@ -15,14 +15,16 @@
 
 namespace Fortran::runtime {
 
-std::size_t TrimTrailingSpaces(const char *s, std::size_t n) {
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS std::size_t TrimTrailingSpaces(const char *s, std::size_t n) {
   while (n > 0 && s[n - 1] == ' ') {
     --n;
   }
   return n;
 }
 
-OwningPtr<char> SaveDefaultCharacter(
+RT_API_ATTRS OwningPtr<char> SaveDefaultCharacter(
     const char *s, std::size_t length, const Terminator &terminator) {
   if (s) {
     auto *p{static_cast<char *>(AllocateMemoryOrCrash(terminator, length + 1))};
@@ -34,7 +36,7 @@ OwningPtr<char> SaveDefaultCharacter(
   }
 }
 
-static bool CaseInsensitiveMatch(
+static RT_API_ATTRS bool CaseInsensitiveMatch(
     const char *value, std::size_t length, const char *possibility) {
   for (; length-- > 0; ++possibility) {
     char ch{*value++};
@@ -57,7 +59,7 @@ static bool CaseInsensitiveMatch(
   return *possibility == '\0';
 }
 
-int IdentifyValue(
+RT_API_ATTRS int IdentifyValue(
     const char *value, std::size_t length, const char *possibilities[]) {
   if (value) {
     for (int j{0}; possibilities[j]; ++j) {
@@ -69,9 +71,9 @@ int IdentifyValue(
   return -1;
 }
 
-void ToFortranDefaultCharacter(
+RT_API_ATTRS void ToFortranDefaultCharacter(
     char *to, std::size_t toLength, const char *from) {
-  std::size_t len{std::strlen(from)};
+  std::size_t len{Fortran::runtime::strlen(from)};
   if (len < toLength) {
     std::memcpy(to, from, len);
     std::memset(to + len, ' ', toLength - len);
@@ -80,7 +82,7 @@ void ToFortranDefaultCharacter(
   }
 }
 
-void CheckConformability(const Descriptor &to, const Descriptor &x,
+RT_API_ATTRS void CheckConformability(const Descriptor &to, const Descriptor &x,
     Terminator &terminator, const char *funcName, const char *toName,
     const char *xName) {
   if (x.rank() == 0) {
@@ -104,14 +106,15 @@ void CheckConformability(const Descriptor &to, const Descriptor &x,
   }
 }
 
-void CheckIntegerKind(Terminator &terminator, int kind, const char *intrinsic) {
+RT_API_ATTRS void CheckIntegerKind(
+    Terminator &terminator, int kind, const char *intrinsic) {
   if (kind < 1 || kind > 16 || (kind & (kind - 1)) != 0) {
     terminator.Crash(
         "not yet implemented: %s: KIND=%d argument", intrinsic, kind);
   }
 }
 
-void ShallowCopyDiscontiguousToDiscontiguous(
+RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
   SubscriptValue toAt[maxRank], fromAt[maxRank];
   to.GetLowerBounds(toAt);
@@ -124,7 +127,7 @@ void ShallowCopyDiscontiguousToDiscontiguous(
   }
 }
 
-void ShallowCopyDiscontiguousToContiguous(
+RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from) {
   char *toAt{to.OffsetElement()};
   SubscriptValue fromAt[maxRank];
@@ -136,7 +139,7 @@ void ShallowCopyDiscontiguousToContiguous(
   }
 }
 
-void ShallowCopyContiguousToDiscontiguous(
+RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
   SubscriptValue toAt[maxRank];
   to.GetLowerBounds(toAt);
@@ -148,7 +151,7 @@ void ShallowCopyContiguousToDiscontiguous(
   }
 }
 
-void ShallowCopy(const Descriptor &to, const Descriptor &from,
+RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
     bool toIsContiguous, bool fromIsContiguous) {
   if (toIsContiguous) {
     if (fromIsContiguous) {
@@ -166,7 +169,9 @@ void ShallowCopy(const Descriptor &to, const Descriptor &from,
   }
 }
 
-void ShallowCopy(const Descriptor &to, const Descriptor &from) {
+RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from) {
   ShallowCopy(to, from, to.IsContiguous(), from.IsContiguous());
 }
+
+RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index 34ee8c56aa962a..ea659190e14391 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -23,20 +23,20 @@ namespace Fortran::runtime {
 
 class Terminator;
 
-std::size_t TrimTrailingSpaces(const char *, std::size_t);
+RT_API_ATTRS std::size_t TrimTrailingSpaces(const char *, std::size_t);
 
-OwningPtr<char> SaveDefaultCharacter(
+RT_API_ATTRS OwningPtr<char> SaveDefaultCharacter(
     const char *, std::size_t, const Terminator &);
 
 // For validating and recognizing default CHARACTER values in a
 // case-insensitive manner.  Returns the zero-based index into the
 // null-terminated array of upper-case possibilities when the value is valid,
 // or -1 when it has no match.
-int IdentifyValue(
+RT_API_ATTRS int IdentifyValue(
     const char *value, std::size_t length, const char *possibilities[]);
 
 // Truncates or pads as necessary
-void ToFortranDefaultCharacter(
+RT_API_ATTRS void ToFortranDefaultCharacter(
     char *to, std::size_t toLength, const char *from);
 
 // Utility for dealing with elemental LOGICAL arguments
@@ -59,8 +59,8 @@ RT_API_ATTRS void CheckConformability(const Descriptor &to, const Descriptor &x,
 
 // Helper to store integer value in result[at].
 template <int KIND> struct StoreIntegerAt {
-  void operator()(const Fortran::runtime::Descriptor &result, std::size_t at,
-      std::int64_t value) const {
+  RT_API_ATTRS void operator()(const Fortran::runtime::Descriptor &result,
+      std::size_t at, std::int64_t value) const {
     *result.ZeroBasedIndexedElement<Fortran::runtime::CppTypeFor<
         Fortran::common::TypeCategory::Integer, KIND>>(at) = value;
   }
@@ -71,7 +71,8 @@ RT_API_ATTRS void CheckIntegerKind(
     Terminator &, int kind, const char *intrinsic);
 
 template <typename TO, typename FROM>
-inline void PutContiguousConverted(TO *to, FROM *from, std::size_t count) {
+inline RT_API_ATTRS void PutContiguousConverted(
+    TO *to, FROM *from, std::size_t count) {
   while (count-- > 0) {
     *to++ = *from++;
   }
@@ -94,7 +95,7 @@ static inline RT_API_ATTRS std::int64_t GetInt64(
 }
 
 template <typename INT>
-inline bool SetInteger(INT &x, int kind, std::int64_t value) {
+inline RT_API_ATTRS bool SetInteger(INT &x, int kind, std::int64_t value) {
   switch (kind) {
   case 1:
     reinterpret_cast<CppTypeFor<TypeCategory::Integer, 1> &>(x) = value;
@@ -300,8 +301,8 @@ inline RT_API_ATTRS RESULT ApplyLogicalKind(
 }
 
 // Calculate result type of (X op Y) for *, //, DOT_PRODUCT, &c.
-std::optional<std::pair<TypeCategory, int>> inline constexpr GetResultType(
-    TypeCategory xCat, int xKind, TypeCategory yCat, int yKind) {
+std::optional<std::pair<TypeCategory, int>> inline constexpr RT_API_ATTRS
+GetResultType(TypeCategory xCat, int xKind, TypeCategory yCat, int yKind) {
   int maxKind{std::max(xKind, yKind)};
   switch (xCat) {
   case TypeCategory::Integer:
@@ -379,7 +380,7 @@ using AccumulationType = CppTypeFor<CAT,
 
 // memchr() for any character type
 template <typename CHAR>
-static inline const CHAR *FindCharacter(
+static inline RT_API_ATTRS const CHAR *FindCharacter(
     const CHAR *data, CHAR ch, std::size_t chars) {
   const CHAR *end{data + chars};
   for (const CHAR *p{data}; p < end; ++p) {
@@ -391,7 +392,8 @@ static inline const CHAR *FindCharacter(
 }
 
 template <>
-inline const char *FindCharacter(const char *data, char ch, std::size_t chars) {
+inline RT_API_ATTRS const char *FindCharacter(
+    const char *data, char ch, std::size_t chars) {
   return reinterpret_cast<const char *>(
       std::memchr(data, static_cast<int>(ch), chars));
 }
@@ -399,15 +401,15 @@ inline const char *FindCharacter(const char *data, char ch, std::size_t chars) {
 // Copy payload data from one allocated descriptor to another.
 // Assumes element counts and element sizes match, and that both
 // descriptors are allocated.
-void ShallowCopyDiscontiguousToDiscontiguous(
+RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from);
-void ShallowCopyDiscontiguousToContiguous(
+RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from);
-void ShallowCopyContiguousToDiscontiguous(
+RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from);
-void ShallowCopy(const Descriptor &to, const Descriptor &from,
+RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
     bool toIsContiguous, bool fromIsContiguous);
-void ShallowCopy(const Descriptor &to, const Descriptor &from);
+RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from);
 
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_TOOLS_H_
diff --git a/flang/runtime/type-info.cpp b/flang/runtime/type-info.cpp
index 5bd0258cbbf7ff..baf446e0c79d3c 100644
--- a/flang/runtime/type-info.cpp
+++ b/flang/runtime/type-info.cpp
@@ -8,11 +8,14 @@
 
 #include "type-info.h"
 #include "terminator.h"
+#include "tools.h"
 #include <cstdio>
 
 namespace Fortran::runtime::typeInfo {
 
-std::optional<TypeParameterValue> Value::GetValue(
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS std::optional<TypeParameterValue> Value::GetValue(
     const Descriptor *descriptor) const {
   switch (genre_) {
   case Genre::Explicit:
@@ -29,7 +32,8 @@ std::optional<TypeParameterValue> Value::GetValue(
   }
 }
 
-std::size_t Component::GetElementByteSize(const Descriptor &instance) const {
+RT_API_ATTRS std::size_t Component::GetElementByteSize(
+    const Descriptor &instance) const {
   switch (category()) {
   case TypeCategory::Integer:
   case TypeCategory::Real:
@@ -51,7 +55,8 @@ std::size_t Component::GetElementByteSize(const Descriptor &instance) const {
   return 0;
 }
 
-std::size_t Component::GetElements(const Descriptor &instance) const {
+RT_API_ATTRS std::size_t Component::GetElements(
+    const Descriptor &instance) const {
   std::size_t elements{1};
   if (int rank{rank_}) {
     if (const Value * boundValues{bounds()}) {
@@ -73,7 +78,8 @@ std::size_t Component::GetElements(const Descriptor &instance) const {
   return elements;
 }
 
-std::size_t Component::SizeInBytes(const Descriptor &instance) const {
+RT_API_ATTRS std::size_t Component::SizeInBytes(
+    const Descriptor &instance) const {
   if (genre() == Genre::Data) {
     return GetElementByteSize(instance) * GetElements(instance);
   } else if (category() == TypeCategory::Derived) {
@@ -85,7 +91,7 @@ std::size_t Component::SizeInBytes(const Descriptor &instance) const {
   }
 }
 
-void Component::EstablishDescriptor(Descriptor &descriptor,
+RT_API_ATTRS void Component::EstablishDescriptor(Descriptor &descriptor,
     const Descriptor &container, Terminator &terminator) const {
   ISO::CFI_attribute_t attribute{static_cast<ISO::CFI_attribute_t>(
       genre_ == Genre::Allocatable   ? CFI_attribute_allocatable
@@ -128,7 +134,7 @@ void Component::EstablishDescriptor(Descriptor &descriptor,
   }
 }
 
-void Component::CreatePointerDescriptor(Descriptor &descriptor,
+RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor,
     const Descriptor &container, Terminator &terminator,
     const SubscriptValue *subscripts) const {
   RUNTIME_CHECK(terminator, genre_ == Genre::Data);
@@ -141,7 +147,7 @@ void Component::CreatePointerDescriptor(Descriptor &descriptor,
   descriptor.raw().attribute = CFI_attribute_pointer;
 }
 
-const DerivedType *DerivedType::GetParentType() const {
+RT_API_ATTRS const DerivedType *DerivedType::GetParentType() const {
   if (hasParent_) {
     const Descriptor &compDesc{component()};
     const Component &component{*compDesc.OffsetElement<const Component>()};
@@ -151,7 +157,7 @@ const DerivedType *DerivedType::GetParentType() const {
   }
 }
 
-const Component *DerivedType::FindDataComponent(
+RT_API_ATTRS const Component *DerivedType::FindDataComponent(
     const char *compName, std::size_t compNameLen) const {
   const Descriptor &compDesc{component()};
   std::size_t n{compDesc.Elements()};
@@ -162,7 +168,8 @@ const Component *DerivedType::FindDataComponent(
     INTERNAL_CHECK(component != nullptr);
     const Descriptor &nameDesc{component->name()};
     if (nameDesc.ElementBytes() == compNameLen &&
-        std::memcmp(compName, nameDesc.OffsetElement(), compNameLen) == 0) {
+        Fortran::runtime::memcmp(
+            compName, nameDesc.OffsetElement(), compNameLen) == 0) {
       return component;
     }
   }
@@ -170,6 +177,8 @@ const Component *DerivedType::FindDataComponent(
   return parent ? parent->FindDataComponent(compName, compNameLen) : nullptr;
 }
 
+RT_OFFLOAD_API_GROUP_END
+
 static void DumpScalarCharacter(
     FILE *f, const Descriptor &desc, const char *what) {
   if (desc.raw().version == CFI_VERSION &&
diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index e93d594490b649..7d96a72e5f36d6 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -61,6 +61,9 @@ set(FLANG_TEST_DEPENDS
   llvm-objdump
   llvm-readobj
   split-file
+  FortranRuntime
+  Fortran_main
+  FortranDecimal
 )
 if (LLVM_ENABLE_PLUGINS AND NOT WIN32)
   list(APPEND FLANG_TEST_DEPENDS Bye)
diff --git a/flang/test/Driver/aarch64-sve-vector-bits.f90 b/flang/test/Driver/aarch64-sve-vector-bits.f90
new file mode 100644
index 00000000000000..cc3bbfb1d61512
--- /dev/null
+++ b/flang/test/Driver/aarch64-sve-vector-bits.f90
@@ -0,0 +1,65 @@
+! -----------------------------------------------------------------------------
+! Tests for the -msve-vector-bits flag (taken from the clang test)
+! -----------------------------------------------------------------------------
+
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=128 2>&1 | FileCheck --check-prefix=CHECK-128 %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=256 2>&1 | FileCheck --check-prefix=CHECK-256 %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=512 2>&1 | FileCheck --check-prefix=CHECK-512 %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=1024 2>&1 | FileCheck --check-prefix=CHECK-1024 %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=2048 2>&1 | FileCheck --check-prefix=CHECK-2048 %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=128+ 2>&1 | FileCheck --check-prefix=CHECK-128P %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=256+ 2>&1 | FileCheck --check-prefix=CHECK-256P %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=512+ 2>&1 | FileCheck --check-prefix=CHECK-512P %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=1024+ 2>&1 | FileCheck --check-prefix=CHECK-1024P %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=2048+ 2>&1 | FileCheck --check-prefix=CHECK-2048P %s
+! RUN: %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=scalable 2>&1 | FileCheck --check-prefix=CHECK-SCALABLE %s
+
+! CHECK-128: "-fc1"
+! CHECK-128-SAME: "-mvscale-max=1" "-mvscale-min=1"
+! CHECK-256: "-fc1"
+! CHECK-256-SAME: "-mvscale-max=2" "-mvscale-min=2"
+! CHECK-512: "-fc1"
+! CHECK-512-SAME: "-mvscale-max=4" "-mvscale-min=4"
+! CHECK-1024: "-fc1"
+! CHECK-1024-SAME: "-mvscale-max=8" "-mvscale-min=8"
+! CHECK-2048: "-fc1"
+! CHECK-2048-SAME: "-mvscale-max=16" "-mvscale-min=16"
+
+! CHECK-128P: "-fc1"
+! CHECK-128P-SAME: "-mvscale-min=1"
+! CHECK-128P-NOT: "-mvscale-max"
+! CHECK-256P: "-fc1"
+! CHECK-256P-SAME: "-mvscale-min=2"
+! CHECK-256P-NOT: "-mvscale-max"
+! CHECK-512P: "-fc1"
+! CHECK-512P-SAME: "-mvscale-min=4"
+! CHECK-512P-NOT: "-mvscale-max"
+! CHECK-1024P: "-fc1"
+! CHECK-1024P-SAME: "-mvscale-min=8"
+! CHECK-1024P-NOT: "-mvscale-max"
+! CHECK-2048P: "-fc1"
+! CHECK-2048P-SAME: "-mvscale-min=16"
+! CHECK-2048P-NOT: "-mvscale-max"
+! CHECK-SCALABLE-NOT: "-mvscale-min=
+! CHECK-SCALABLE-NOT: "-mvscale-max=
+
+! Error out if an unsupported value is passed to -msve-vector-bits.
+! -----------------------------------------------------------------------------
+! RUN: not %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=64 2>&1 | FileCheck --check-prefix=CHECK-BAD-VALUE-ERROR %s
+! RUN: not %flang -c %s -### --target=aarch64-none-linux-gnu -march=armv8-a+sve \
+! RUN:  -msve-vector-bits=A 2>&1 | FileCheck --check-prefix=CHECK-BAD-VALUE-ERROR %s
+
+! CHECK-BAD-VALUE-ERROR: error: unsupported argument '{{.*}}' to option '-msve-vector-bits='
+
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index 7aae2b195815e2..807b0f938d27b5 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -110,6 +110,8 @@
 ! CHECK-NEXT: -mllvm <value>          Additional arguments to forward to LLVM's option processing
 ! CHECK-NEXT: -mmlir <value>          Additional arguments to forward to MLIR's option processing
 ! CHECK-NEXT: -module-dir <dir>       Put MODULE files in <dir>
+! CHECK-NEXT: -msve-vector-bits=<value>
+! CHECK-NEXT:                          Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)
 ! CHECK-NEXT: --no-offload-arch=<value>
 ! CHECK-NEXT:                         Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.
 ! CHECK-NEXT: -nocpp                  Disable predefined and command line preprocessor macros
diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90
index 549bec6aca58c1..4894f90f531043 100644
--- a/flang/test/Driver/driver-help.f90
+++ b/flang/test/Driver/driver-help.f90
@@ -98,6 +98,8 @@
 ! HELP-NEXT: -mllvm <value>          Additional arguments to forward to LLVM's option processing
 ! HELP-NEXT: -mmlir <value>          Additional arguments to forward to MLIR's option processing
 ! HELP-NEXT: -module-dir <dir>       Put MODULE files in <dir>
+! HELP-NEXT: -msve-vector-bits=<value>
+! HELP-NEXT:                          Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)
 ! HELP-NEXT: --no-offload-arch=<value>
 ! HELP-NEXT:                         Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.
 ! HELP-NEXT: -nocpp                  Disable predefined and command line preprocessor macros
@@ -228,6 +230,8 @@
 ! HELP-FC1-NEXT: -mreassociate           Allow reassociation transformations for floating-point instructions
 ! HELP-FC1-NEXT: -mrelocation-model <value>
 ! HELP-FC1-NEXT:                         The relocation model to use
+! HELP-FC1-NEXT: -mvscale-max=<value>    Specify the vscale maximum. Defaults to the vector length agnostic value of "0". (AArch64/RISC-V only)
+! HELP-FC1-NEXT: -mvscale-min=<value>    Specify the vscale minimum. Defaults to "1". (AArch64/RISC-V only)
 ! HELP-FC1-NEXT: -nocpp                  Disable predefined and command line preprocessor macros
 ! HELP-FC1-NEXT: -opt-record-file <value>
 ! HELP-FC1-NEXT:                         File name to use for YAML optimization record output
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index 2fbf3c4bf3450a..09b8a224df1382 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -24,18 +24,21 @@
 ! GNU-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! GNU-SAME: "[[object_file]]"
 ! GNU-SAME: -lFortran_main
-! GNU-SAME: -lflang-rt
+! GNU-SAME: -lFortranRuntime
+! GNU-SAME: -lFortranDecimal
 ! GNU-SAME: -lm
 
 ! DARWIN-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! DARWIN-SAME: "[[object_file]]"
 ! DARWIN-SAME: -lFortran_main
-! DARWIN-SAME: -lflang-rt
+! DARWIN-SAME: -lFortranRuntime
+! DARWIN-SAME: -lFortranDecimal
 
 ! MINGW-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! MINGW-SAME: "[[object_file]]"
 ! MINGW-SAME: -lFortran_main
-! MINGW-SAME: -lflang-rt
+! MINGW-SAME: -lFortranRuntime
+! MINGW-SAME: -lFortranDecimal
 
 ! NOTE: This also matches lld-link (when CLANG_DEFAULT_LINKER=lld) and
 !       any .exe suffix that is added when resolving to the full path of
@@ -43,6 +46,7 @@
 !       when the executable is not found or on non-Windows platforms.
 ! MSVC-LABEL: link
 ! MSVC-SAME: Fortran_main.lib
-! MSVC-SAME: flang-rt.lib
+! MSVC-SAME: FortranRuntime.lib
+! MSVC-SAME: FortranDecimal.lib
 ! MSVC-SAME: /subsystem:console
 ! MSVC-SAME: "[[object_file]]"
diff --git a/flang/test/Fir/array-value-copy-cam4.fir b/flang/test/Fir/array-value-copy-cam4.fir
index 3b3b0082743ce3..3bb465cfaa59ff 100644
--- a/flang/test/Fir/array-value-copy-cam4.fir
+++ b/flang/test/Fir/array-value-copy-cam4.fir
@@ -98,5 +98,5 @@ module {
     return
   }
   func.func private @_QPinit(!fir.ref<!fir.array<4x27xf64>>)
-  fir.dispatch_table @_QMcam4Tpbuf_fld
+  fir.type_info @_QMcam4Tpbuf_fld : !fir.type<_QMcam4Tpbuf_fld{fld_ptr:!fir.box<!fir.ptr<!fir.array<?x?x?x?x?xf64>>>}>
 }
diff --git a/flang/test/Fir/convert-to-llvm-invalid.fir b/flang/test/Fir/convert-to-llvm-invalid.fir
index fcd042f83bd03c..9f003e4eb7d59d 100644
--- a/flang/test/Fir/convert-to-llvm-invalid.fir
+++ b/flang/test/Fir/convert-to-llvm-invalid.fir
@@ -2,12 +2,6 @@
 
 // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" --verify-diagnostics %s
 
-// Verify that `fir.dt_entry` requires a parent op
-
-// expected-error@+1{{'fir.dt_entry' op expects parent op 'fir.dispatch_table'}}
-fir.dt_entry "method", @method_impl
-
-// -----
 
 // Test `fir.shape` conversion failure because the op has uses.
 
@@ -82,11 +76,28 @@ func.func @bar_select_type(%arg : !fir.class<!fir.type<derivedst{a:f32}>>) -> i3
 
 // Verify that `fir.dt_entry` requires a parent op
 
-// expected-error@+1{{'fir.dt_entry' op expects parent op 'fir.dispatch_table'}}
+// expected-error@+1{{'fir.dt_entry' op expects parent op 'fir.type_info'}}
 fir.dt_entry "method", @method_impl
 
 // -----
 
+// expected-error@+1{{'fir.type_info' op type must be a fir.type}}
+fir.type_info @bad : i32
+
+// -----
+
+// expected-error@+1{{'fir.type_info' op parent_type must be a fir.type}}
+fir.type_info @bad extends f32 : !fir.type<t1{i:i32}>
+
+// -----
+
+fir.type_info @bad : !fir.type<bad{i:i32}> dispatch_table {
+  // expected-error@+1{{dispatch table must contain dt_entry}}
+  %zero = arith.constant 0 : i32
+}
+
+// -----
+
 // `fir.coordinate_of` - dynamically sized arrays are not supported
 func.func @coordinate_of_dynamic_array(%arg0: !fir.ref<!fir.array<1x!fir.char<4,?>>>, %arg1: index) {
   // expected-error@+2{{fir.coordinate_of with a dynamic element size is unsupported}}
diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90
index 933c769d3e1693..ce60939410de69 100644
--- a/flang/test/Fir/dispatch.f90
+++ b/flang/test/Fir/dispatch.f90
@@ -308,10 +308,10 @@ program test_type_to_class
 ! Check the layout of the binding table. This is easier to do in FIR than in 
 ! LLVM IR.
 
-! BT-LABEL: fir.dispatch_table @_QMdispatch1Tty_kindK10K20
-! BT-LABEL: fir.dispatch_table @_QMdispatch1Tty_kind_exK10K20 extends("_QMdispatch1Tty_kindK10K20")
+! BT-LABEL: fir.type_info @_QMdispatch1Tty_kindK10K20
+! BT-LABEL: fir.type_info @_QMdispatch1Tty_kind_exK10K20 {{.*}}extends !fir.type<_QMdispatch1Tty_kindK10K20{{.*}}>
 
-! BT-LABEL: fir.dispatch_table @_QMdispatch1Tp1 {
+! BT-LABEL: fir.type_info @_QMdispatch1Tp1
 ! BT: fir.dt_entry "aproc", @_QMdispatch1Paproc
 ! BT: fir.dt_entry "display1", @_QMdispatch1Pdisplay1_p1
 ! BT: fir.dt_entry "display2", @_QMdispatch1Pdisplay2_p1
@@ -321,15 +321,15 @@ program test_type_to_class
 ! BT: fir.dt_entry "proc_with_values", @_QMdispatch1Pproc_p1
 ! BT: }
 
-! BT-LABEL: fir.dispatch_table @_QMdispatch1Ta1 {
+! BT-LABEL: fir.type_info @_QMdispatch1Ta1
 ! BT: fir.dt_entry "a1_proc", @_QMdispatch1Pa1_proc
 ! BT: }
 
-! BT-LABEL: fir.dispatch_table @_QMdispatch1Ta2 extends("_QMdispatch1Ta1") {
+! BT-LABEL: fir.type_info @_QMdispatch1Ta2 {{.*}}extends !fir.type<_QMdispatch1Ta1{{.*}}>
 ! BT:  fir.dt_entry "a1_proc", @_QMdispatch1Pa2_proc
 ! BT: }
 
-! BT-LABEL: fir.dispatch_table @_QMdispatch1Tp2 extends("_QMdispatch1Tp1") {
+! BT-LABEL: fir.type_info @_QMdispatch1Tp2 {{.*}}extends !fir.type<_QMdispatch1Tp1{{.*}}>
 ! BT:  fir.dt_entry "aproc", @_QMdispatch1Paproc
 ! BT:  fir.dt_entry "display1", @_QMdispatch1Pdisplay1_p2
 ! BT:  fir.dt_entry "display2", @_QMdispatch1Pdisplay2_p2
diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir
index f2175ce5fb5aed..dd0fbb3be36c46 100644
--- a/flang/test/Fir/fir-ops.fir
+++ b/flang/test/Fir/fir-ops.fir
@@ -450,13 +450,16 @@ fir.global linkonce_odr @global_linkonce_odr : i32 {
   fir.has_value %0 : i32
 }
 
-// CHECK-LABEL: fir.dispatch_table @dispatch_tbl {
+// CHECK-LABEL: fir.type_info @dispatch_tbl : !fir.type<dispatch_tbl{i:i32}> dispatch_table {
 // CHECK: fir.dt_entry "method", @method_impl
 // CHECK: }
-fir.dispatch_table @dispatch_tbl {
+fir.type_info @dispatch_tbl : !fir.type<dispatch_tbl{i:i32}> dispatch_table {
   fir.dt_entry "method", @method_impl
 }
 
+// CHECK-LABEL: fir.type_info @test_type_info noinit nodestroy nofinal extends !fir.type<parent{i:i32}> : !fir.type<test_type_info{i:i32,j:f32}>
+fir.type_info @test_type_info noinit nodestroy nofinal extends !fir.type<parent{i:i32}> : !fir.type<test_type_info{i:i32,j:f32}>
+
 // CHECK-LABEL: func @compare_complex(
 // CHECK-SAME: [[VAL_151:%.*]]: !fir.complex<16>, [[VAL_152:%.*]]: !fir.complex<16>) {
 func.func @compare_complex(%a : !fir.complex<16>, %b : !fir.complex<16>) {
diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir
index f8a52b2e98db0c..eabc9f30127fa3 100644
--- a/flang/test/Fir/tbaa.fir
+++ b/flang/test/Fir/tbaa.fir
@@ -351,3 +351,37 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<?xi32>>) {
 // CHECK:           %[[VAL_14:.*]] = llvm.bitcast %[[VAL_13]] : !llvm.ptr<i8> to !llvm.ptr<i32>
 // CHECK:           llvm.return
 // CHECK:         }
+
+// -----
+
+// Check that the scalar aggregate load/store with a descriptor member
+// is mapped to any-access.
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
+// CHECK-DAG:     #[[$ANYT:.*]] = #llvm.tbaa_tag<base_type = #[[ANYACC]], access_type = #[[ANYACC]], offset = 0>
+
+func.func @tbaa(%arg0: !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>, %arg1: !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>) {
+  %0 = fir.load %arg0 : !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+  fir.store %0 to %arg1 : !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+  return
+}
+// CHECK-LABEL:   llvm.func @tbaa(
+// CHECK: llvm.load{{.*}}{tbaa = [#[[$ANYT]]]}
+// CHECK: llvm.store{{.*}}{tbaa = [#[[$ANYT]]]}
+
+// -----
+
+// Check that the array aggregate load/store with a descriptor member
+// is mapped to any-access.
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
+// CHECK-DAG:     #[[$ANYT:.*]] = #llvm.tbaa_tag<base_type = #[[ANYACC]], access_type = #[[ANYACC]], offset = 0>
+
+func.func @tbaa(%arg0: !fir.ref<!fir.array<2x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>, %arg1: !fir.ref<!fir.array<2x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>) {
+  %0 = fir.load %arg0 : !fir.ref<!fir.array<2x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>
+  fir.store %0 to %arg1 : !fir.ref<!fir.array<2x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>
+  return
+}
+// CHECK-LABEL:   llvm.func @tbaa(
+// CHECK: llvm.load{{.*}}{tbaa = [#[[$ANYT]]]}
+// CHECK: llvm.store{{.*}}{tbaa = [#[[$ANYT]]]}
diff --git a/flang/test/HLFIR/c-null-ptr-init.f90 b/flang/test/HLFIR/c-null-ptr-init.f90
new file mode 100644
index 00000000000000..379b8ccbe855c8
--- /dev/null
+++ b/flang/test/HLFIR/c-null-ptr-init.f90
@@ -0,0 +1,19 @@
+! Test lowering of C_NULL_PTR in structure constructor initial value.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+subroutine test
+  use, intrinsic :: iso_c_binding, only : c_ptr, c_null_ptr
+  type t
+     type(c_ptr) :: ptr
+  end type t
+  type(t) :: x = t(c_null_ptr)
+end subroutine
+! CHECK-LABEL:  fir.global internal @_QFtestEx : !fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> {
+! CHECK:           %[[VAL_0:.*]] = fir.undefined !fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK:           %[[VAL_1:.*]] = fir.field_index ptr, !fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK:           %[[VAL_2:.*]] = fir.undefined !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK:           %[[VAL_3:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
+! CHECK:           %[[VAL_5:.*]] = fir.insert_value %[[VAL_2]], %[[VAL_4]], ["__address", !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>] : (!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>, i64) -> !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK:           %[[VAL_6:.*]] = fir.insert_value %[[VAL_0]], %[[VAL_5]], ["ptr", !fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>] : (!fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> !fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK:           fir.has_value %[[VAL_6]] : !fir.type<_QFtestTt{ptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK:         }
diff --git a/flang/test/HLFIR/elemental-shallow-copy.fir b/flang/test/HLFIR/elemental-shallow-copy.fir
new file mode 100644
index 00000000000000..c57a2766e318de
--- /dev/null
+++ b/flang/test/HLFIR/elemental-shallow-copy.fir
@@ -0,0 +1,31 @@
+// Check that an elemental result of a derived type with an allocatable
+// component is shallow-copied into the array result.
+// RUN: fir-opt %s --bufferize-hlfir | FileCheck %s
+
+func.func @_QMtypesPtest() {
+  %false = arith.constant false
+  %c1 = arith.constant 1 : index
+  %0 = fir.alloca !fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}> {bindc_name = ".result"}
+  %11 = fir.shape %c1 : (index) -> !fir.shape<1>
+  %18 = fir.alloca !fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>> {bindc_name = "y", uniq_name = "_QMtypesFtestEy"}
+  %19:2 = hlfir.declare %18(%11) {uniq_name = "_QMtypesFtestEy"} : (!fir.ref<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>, !fir.ref<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>)
+  %23 = hlfir.elemental %11 : (!fir.shape<1>) -> !hlfir.expr<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>> {
+  ^bb0(%arg0: index):
+    %26:2 = hlfir.declare %0 {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>, !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>)
+    %27 = hlfir.as_expr %26#0 move %false : (!fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>, i1) -> !hlfir.expr<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+    hlfir.yield_element %27 : !hlfir.expr<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+  }
+  hlfir.assign %23 to %19#0 : !hlfir.expr<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>, !fir.ref<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>
+  hlfir.destroy %23 : !hlfir.expr<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+  return
+}
+// CHECK-LABEL:   func.func @_QMtypesPtest() {
+// CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}> {bindc_name = ".result"}
+// CHECK:           %[[VAL_6:.*]] = fir.allocmem !fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>> {bindc_name = ".tmp.array", uniq_name = ""}
+// CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%{{.*}}) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>, !fir.shape<1>) -> (!fir.heap<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>, !fir.heap<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>)
+// CHECK:           fir.do_loop %[[VAL_10:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>, !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>)
+// CHECK:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_10]])  : (!fir.heap<!fir.array<1x!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>>, index) -> !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+// CHECK:             %[[VAL_16:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+// CHECK:             fir.store %[[VAL_16]] to %[[VAL_15]] : !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}>>
+// CHECK:           }
diff --git a/flang/test/Lower/Arm/arm-sve-vector-bits-vscale-range.f90 b/flang/test/Lower/Arm/arm-sve-vector-bits-vscale-range.f90
new file mode 100644
index 00000000000000..9e09a411986267
--- /dev/null
+++ b/flang/test/Lower/Arm/arm-sve-vector-bits-vscale-range.f90
@@ -0,0 +1,24 @@
+! REQUIRES: aarch64-registered-target
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -mvscale-max=1  -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=2 -mvscale-max=2  -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -mvscale-max=4  -emit-llvm -o - %s | FileCheck %s -D#VBITS=4
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=8 -mvscale-max=8  -emit-llvm -o - %s | FileCheck %s -D#VBITS=8
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=16 -mvscale-max=16  -emit-llvm -o - %s | FileCheck %s -D#VBITS=16
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=1 -mvscale-max=1  -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=2 -mvscale-max=2  -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1  -emit-llvm -o - %s | FileCheck %s -D#VBITS=1 --check-prefix=CHECK-NOMAX
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=2  -emit-llvm -o - %s | FileCheck %s -D#VBITS=2 --check-prefix=CHECK-NOMAX
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4  -emit-llvm -o - %s | FileCheck %s -D#VBITS=4 --check-prefix=CHECK-NOMAX
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=8  -emit-llvm -o - %s | FileCheck %s -D#VBITS=8 --check-prefix=CHECK-NOMAX
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=16  -emit-llvm -o - %s | FileCheck %s -D#VBITS=16 --check-prefix=CHECK-NOMAX
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=1 -mvscale-max=0  -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -mvscale-max=0  -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
+! RUN: %flang_fc1 -triple aarch64-none-linux-gnu -target-feature +sve -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-NONE
+
+! CHECK-LABEL: @func_() #0
+! CHECK: attributes #0 = {{{.*}} vscale_range([[#VBITS]],[[#VBITS]]) {{.*}}}
+! CHECK-NOMAX: attributes #0 = {{{.*}} vscale_range([[#VBITS]],0) {{.*}}}
+! CHECK-UNBOUNDED: attributes #0 = {{{.*}} vscale_range(1,0) {{.*}}}
+! CHECK-NONE: attributes #0 = {{{.*}} vscale_range(1,16) {{.*}}}
+subroutine func
+end subroutine func
diff --git a/flang/test/Lower/HLFIR/bindc-entry-stmt.f90 b/flang/test/Lower/HLFIR/bindc-entry-stmt.f90
index 89e494689e9964..3be14ed95e1f03 100644
--- a/flang/test/Lower/HLFIR/bindc-entry-stmt.f90
+++ b/flang/test/Lower/HLFIR/bindc-entry-stmt.f90
@@ -24,15 +24,14 @@ function foo() bind(c)
 ! CHECK-LABEL:   func.func @_QPbar(
 ! CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<!fir.char<1>>,
 ! CHECK-SAME:                      %[[VAL_1:.*]]: index) -> !fir.boxchar<1> {
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[VAL_3]] {uniq_name = "_QFfooEfoo"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] {uniq_name = "_QFfooEfoo"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[VAL_5]] {uniq_name = "_QFfooEbar"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_5]] {uniq_name = "_QFfooEbar"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
 ! CHECK:           cf.br ^bb1
 ! CHECK:         ^bb1:
-! CHECK:           hlfir.assign %{{.*}} to %[[VAL_6]]#0 : !fir.ref<!fir.char<1>>, !fir.boxchar<1>
-! CHECK:           %[[VAL_10:.*]] = fir.emboxchar %[[VAL_6]]#1, %[[VAL_5]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           hlfir.assign %{{.*}} to %[[VAL_6]]#0 : !fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>
+! CHECK:           %[[VAL_10:.*]] = fir.emboxchar %[[VAL_6]]#1, %[[VAL_5]] : (!fir.ref<!fir.char<1>>, index) -> !fir.boxchar<1>
 ! CHECK:           return %[[VAL_10]] : !fir.boxchar<1>
 ! CHECK:         }
 
@@ -44,15 +43,14 @@ function foo2()
 ! CHECK-LABEL:   func.func @_QPfoo2(
 ! CHECK-SAME:                       %[[VAL_0:.*]]: !fir.ref<!fir.char<1>>,
 ! CHECK-SAME:                       %[[VAL_1:.*]]: index) -> !fir.boxchar<1> {
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[VAL_3]] {uniq_name = "_QFfoo2Efoo2"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] {uniq_name = "_QFfoo2Efoo2"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[VAL_5]] {uniq_name = "_QFfoo2Ebar2"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_5]] {uniq_name = "_QFfoo2Ebar2"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
 ! CHECK:           cf.br ^bb1
 ! CHECK:         ^bb1:
-! CHECK:           hlfir.assign %{{.*}} to %[[VAL_6]]#0 : !fir.ref<!fir.char<1>>, !fir.boxchar<1>
-! CHECK:           %[[VAL_10:.*]] = fir.emboxchar %[[VAL_4]]#1, %[[VAL_3]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           hlfir.assign %{{.*}} to %[[VAL_6]]#0 : !fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>
+! CHECK:           %[[VAL_10:.*]] = fir.emboxchar %[[VAL_4]]#1, %[[VAL_3]] : (!fir.ref<!fir.char<1>>, index) -> !fir.boxchar<1>
 ! CHECK:           return %[[VAL_10]] : !fir.boxchar<1>
 ! CHECK:         }
 
diff --git a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
index 447da252d77bde..58c41d46167d7d 100644
--- a/flang/test/Lower/HLFIR/calls-assumed-shape.f90
+++ b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
@@ -90,8 +90,8 @@ subroutine takes_assumed_character(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_explicit_char_to_box(
 ! CHECK:  %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<20x!fir.char<1,10>>>
+! CHECK:  %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>)
diff --git a/flang/test/Lower/HLFIR/char_extremum.f03 b/flang/test/Lower/HLFIR/char_extremum.f03
index 32a519ce3b681e..cc7b80184935ea 100644
--- a/flang/test/Lower/HLFIR/char_extremum.f03
+++ b/flang/test/Lower/HLFIR/char_extremum.f03
@@ -45,14 +45,14 @@ subroutine max2(c1, c2, c3)
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]]  = fir.shape %[[VAL_C100]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_C10:[a-zA-Z0-9_]*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,10>>>
+! CHECK:  %[[VAL_C10:[a-zA-Z0-9_]*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_C100_0:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_0]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) typeparams %[[VAL_C10]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.ref<!fir.array<100x!fir.char<1,10>>>)
 ! CHECK:  %[[VAL_8:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_C20:[a-zA-Z0-9_]*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_9:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,20>>>
+! CHECK:  %[[VAL_C20:[a-zA-Z0-9_]*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_C100_1:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_10:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_1]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_11:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_10]]) typeparams %[[VAL_C20]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.ref<!fir.array<100x!fir.char<1,20>>>)
@@ -78,14 +78,14 @@ subroutine min2(c1, c2, c3)
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_C10:[a-zA-Z0-9_]*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,10>>>
+! CHECK:  %[[VAL_C10:[a-zA-Z0-9_]*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_C100_0:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_0]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) typeparams %[[VAL_C10]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.ref<!fir.array<100x!fir.char<1,10>>>)
 ! CHECK:  %[[VAL_8:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_C20:[a-zA-Z0-9_]*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_C9:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,20>>>
+! CHECK:  %[[VAL_C20:[a-zA-Z0-9_]*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_C100_1:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_10:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_1]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_11:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_C9]](%[[VAL_10]]) typeparams %[[VAL_C20]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.ref<!fir.array<100x!fir.char<1,20>>>)
diff --git a/flang/test/Lower/HLFIR/convert-variable.f90 b/flang/test/Lower/HLFIR/convert-variable.f90
index 746ac085ad115f..e7487ef870d1dd 100644
--- a/flang/test/Lower/HLFIR/convert-variable.f90
+++ b/flang/test/Lower/HLFIR/convert-variable.f90
@@ -22,8 +22,8 @@ subroutine scalar_character_cst_len(c)
 ! CHECK-LABEL: func.func @_QPscalar_character_cst_len(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.boxchar<1>
 ! CHECK:  %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>>
+! CHECK:  %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
 
 subroutine array_numeric(x)
diff --git a/flang/test/Lower/HLFIR/elemental-array-ops.f90 b/flang/test/Lower/HLFIR/elemental-array-ops.f90
index 582912305c0068..96cda16cad2e6a 100644
--- a/flang/test/Lower/HLFIR/elemental-array-ops.f90
+++ b/flang/test/Lower/HLFIR/elemental-array-ops.f90
@@ -185,8 +185,10 @@ end subroutine char_return
 ! CHECK:             %[[VAL_26:.*]] = fir.call @llvm.stacksave.p0() fastmath<contract> : () -> !fir.ref<i8>
 ! CHECK:             %[[VAL_27:.*]] = fir.call @_QPcallee(%[[VAL_2]], %[[VAL_25]], %[[VAL_20]]) fastmath<contract> : (!fir.ref<!fir.char<1,3>>, index, !fir.boxchar<1>) -> !fir.boxchar<1>
 ! CHECK:             %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[VAL_25]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>)
+! CHECK:             %[[MustFree:.*]] = arith.constant false
+! CHECK:             %[[ResultTemp:.*]] = hlfir.as_expr %[[VAL_28]]#0 move %[[MustFree]] : (!fir.ref<!fir.char<1,3>>, i1) -> !hlfir.expr<!fir.char<1,3>>
 ! CHECK:             fir.call @llvm.stackrestore.p0(%[[VAL_26]]) fastmath<contract> : (!fir.ref<i8>) -> ()
-! CHECK:             hlfir.yield_element %[[VAL_28]]#0 : !fir.ref<!fir.char<1,3>>
+! CHECK:             hlfir.yield_element %[[ResultTemp]] : !hlfir.expr<!fir.char<1,3>>
 ! CHECK:           }
 ! CHECK:           %[[VAL_29:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_10]]#0, %[[VAL_29]] : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/HLFIR/elemental-call-vector-subscripts.f90 b/flang/test/Lower/HLFIR/elemental-call-vector-subscripts.f90
new file mode 100644
index 00000000000000..b8f7ee9338fbd1
--- /dev/null
+++ b/flang/test/Lower/HLFIR/elemental-call-vector-subscripts.f90
@@ -0,0 +1,93 @@
+! Test passing of vector subscripted entities inside elemental
+! procedures.
+! RUN: bbc --emit-hlfir -o - %s | FileCheck %s
+
+subroutine test()
+  interface
+    elemental subroutine foo(x, y)
+      real, intent(in) :: x
+      real, value :: y
+    end subroutine
+  end interface
+  real :: x(10)
+  call foo(x([1,3,7]), 0.)
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest() {
+! CHECK:           %[[VAL_0:.*]] = arith.constant 10 : index
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "x", uniq_name = "_QFtestEx"}
+! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) {uniq_name = "_QFtestEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK:           %[[VAL_4:.*]] = fir.address_of(@_QQro.3xi8.0) : !fir.ref<!fir.array<3xi64>>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]])
+! CHECK:           %[[VAL_8:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_11:.*]] = %[[VAL_10]] to %[[VAL_8]] step %[[VAL_10]] unordered {
+! CHECK:             %[[VAL_12:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_11]])  : (!fir.ref<!fir.array<3xi64>>, index) -> !fir.ref<i64>
+! CHECK:             %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref<i64>
+! CHECK:             %[[VAL_14:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_13]])  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
+! CHECK:             fir.call @_QPfoo(%[[VAL_14]], %[[VAL_9]]) {{.*}}: (!fir.ref<f32>, f32) -> ()
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine test_value()
+  interface
+    elemental subroutine foo_value(x, y)
+      real, value :: x
+      real, value :: y
+    end subroutine
+  end interface
+  real :: x(10)
+  call foo_value(x([1,3,7]), 0.)
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPtest_value() {
+! CHECK:           %[[VAL_0:.*]] = arith.constant 10 : index
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "x", uniq_name = "_QFtest_valueEx"}
+! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) {uniq_name = "_QFtest_valueEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK:           %[[VAL_4:.*]] = fir.address_of(@_QQro.3xi8.0) : !fir.ref<!fir.array<3xi64>>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]])
+! CHECK:           %[[VAL_8:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<1>) -> !hlfir.expr<3xf32> {
+! CHECK:           ^bb0(%[[VAL_11:.*]]: index):
+! CHECK:             %[[VAL_12:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_11]])  : (!fir.ref<!fir.array<3xi64>>, index) -> !fir.ref<i64>
+! CHECK:             %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref<i64>
+! CHECK:             %[[VAL_14:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_13]])  : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
+! CHECK:             %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref<f32>
+! CHECK:             hlfir.yield_element %[[VAL_15]] : f32
+! CHECK:           }
+! CHECK:           %[[VAL_16:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           %[[VAL_17:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_18:.*]] = %[[VAL_17]] to %[[VAL_8]] step %[[VAL_17]] unordered {
+! CHECK:             %[[VAL_19:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_18]] : (!hlfir.expr<3xf32>, index) -> f32
+! CHECK:             fir.call @_QPfoo_value(%[[VAL_19]], %[[VAL_16]]) {{.*}}: (f32, f32) -> ()
+! CHECK:           }
+! CHECK:           hlfir.destroy %[[VAL_10]] : !hlfir.expr<3xf32>
+! CHECK:           return
+
+subroutine test_not_a_variable(i)
+  interface
+    elemental subroutine foo2(j)
+      integer(8), intent(in) :: j
+    end subroutine
+  end interface
+  integer(8) :: i(:)
+  call foo2((i(i)))
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_not_a_variable(
+! CHECK:           hlfir.elemental
+! CHECK:           %[[VAL_16:.*]] = hlfir.elemental
+! CHECK:           %[[VAL_20:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_21:.*]] = {{.*}}
+! CHECK:             %[[VAL_22:.*]] = hlfir.apply %[[VAL_16]], %[[VAL_21]] : (!hlfir.expr<?xi64>, index) -> i64
+! CHECK:             %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_22]] {uniq_name = "adapt.valuebyref"} : (i64) -> (!fir.ref<i64>, !fir.ref<i64>, i1)
+! CHECK:             fir.call @_QPfoo2(%[[VAL_23]]#1){{.*}}: (!fir.ref<i64>) -> ()
+! CHECK:             hlfir.end_associate %[[VAL_23]]#1, %[[VAL_23]]#2 : !fir.ref<i64>, i1
+! CHECK:           }
diff --git a/flang/test/Lower/HLFIR/function-return-as-expr.f90 b/flang/test/Lower/HLFIR/function-return-as-expr.f90
new file mode 100644
index 00000000000000..7c8e73779d79e9
--- /dev/null
+++ b/flang/test/Lower/HLFIR/function-return-as-expr.f90
@@ -0,0 +1,135 @@
+! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nowhere 2>&1 | FileCheck %s
+
+module types
+  type t1
+  end type t1
+end module types
+
+subroutine test1
+  integer :: i
+  i = inner() + 1
+contains
+  function inner()
+    integer, allocatable ::  inner
+  end function inner
+end subroutine test1
+! CHECK-LABEL:   func.func @_QPtest1() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = ".result"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK:           %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_6]] : !fir.heap<i32>
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_7]], %[[VAL_8]] : i32
+
+subroutine test2
+  character(len=:), allocatable :: c
+  c = inner()
+contains
+  function inner()
+    character(len=:), allocatable ::  inner
+  end function inner
+end subroutine test2
+! CHECK-LABEL:   func.func @_QPtest2() {
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_11:.*]] = fir.box_elesize %[[VAL_10]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_12:.*]] = fir.emboxchar %[[VAL_9]], %[[VAL_11]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_13:.*]] = arith.constant false
+! CHECK:           %[[VAL_14:.*]] = hlfir.as_expr %[[VAL_12]] move %[[VAL_13]] : (!fir.boxchar<1>, i1) -> !hlfir.expr<!fir.char<1,?>>
+! CHECK:           hlfir.assign %[[VAL_14]] to %{{.*}}#0 realloc : !hlfir.expr<!fir.char<1,?>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+
+subroutine test3
+  character(len=:), allocatable :: c
+  c = inner()
+contains
+  function inner()
+    character(len=3), allocatable ::  inner
+  end function inner
+end subroutine test3
+! CHECK-LABEL:   func.func @_QPtest3() {
+! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %{{.*}} typeparams %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,3>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,3>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,3>>>>)
+! CHECK:           %[[VAL_14:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,3>>>>
+! CHECK:           %[[VAL_15:.*]] = fir.box_addr %[[VAL_14]] : (!fir.box<!fir.heap<!fir.char<1,3>>>) -> !fir.heap<!fir.char<1,3>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant false
+! CHECK:           %[[VAL_17:.*]] = hlfir.as_expr %[[VAL_15]] move %[[VAL_16]] : (!fir.heap<!fir.char<1,3>>, i1) -> !hlfir.expr<!fir.char<1,3>>
+! CHECK:           hlfir.assign %[[VAL_17]] to %{{.*}}#0 realloc : !hlfir.expr<!fir.char<1,3>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+
+subroutine test4
+  class(*), allocatable :: p
+  p = inner()
+contains
+  function inner()
+    class(*), allocatable :: inner
+  end function inner
+end subroutine test4
+! CHECK-LABEL:   func.func @_QPtest4() {
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.class<!fir.heap<none>>>) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
+! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<!fir.class<!fir.heap<none>>>
+! CHECK:           %[[VAL_8:.*]] = arith.constant false
+! CHECK:           %[[VAL_9:.*]] = hlfir.as_expr %[[VAL_7]] move %[[VAL_8]] : (!fir.class<!fir.heap<none>>, i1) -> !hlfir.expr<none?>
+! CHECK:           hlfir.assign %[[VAL_9]] to %{{.*}}#0 realloc : !hlfir.expr<none?>, !fir.ref<!fir.class<!fir.heap<none>>>
+
+subroutine test5
+  use types
+  type(t1) :: r
+  r = inner()
+contains
+  function inner()
+    type(t1) :: inner
+  end function inner
+end subroutine test5
+! CHECK-LABEL:   func.func @_QPtest5() {
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.type<_QMtypesTt1>>) -> (!fir.ref<!fir.type<_QMtypesTt1>>, !fir.ref<!fir.type<_QMtypesTt1>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant false
+! CHECK:           %[[VAL_6:.*]] = hlfir.as_expr %[[VAL_4]]#0 move %[[VAL_5]] : (!fir.ref<!fir.type<_QMtypesTt1>>, i1) -> !hlfir.expr<!fir.type<_QMtypesTt1>>
+! CHECK:           hlfir.assign %[[VAL_6]] to %{{.*}}#0 : !hlfir.expr<!fir.type<_QMtypesTt1>>, !fir.ref<!fir.type<_QMtypesTt1>>
+
+subroutine test6(x)
+  character(len=:), allocatable :: c(:)
+  integer :: x(:)
+  c = inner(x)
+contains
+  elemental function inner(x)
+    integer, intent(in) ::  x
+    character(len=3) ::  inner
+  end function inner
+end subroutine test6
+! CHECK-LABEL:   func.func @_QPtest6(
+! CHECK:           %[[VAL_14:.*]] = hlfir.elemental
+! CHECK:             %[[VAL_24:.*]]:2 = hlfir.declare %{{.*}} typeparams %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>)
+! CHECK:             %[[VAL_25:.*]] = arith.constant false
+! CHECK:             %[[VAL_26:.*]] = hlfir.as_expr %[[VAL_24]]#0 move %[[VAL_25]] : (!fir.ref<!fir.char<1,3>>, i1) -> !hlfir.expr<!fir.char<1,3>>
+! CHECK:             hlfir.yield_element %[[VAL_26]] : !hlfir.expr<!fir.char<1,3>>
+
+subroutine test7(x)
+  use types
+  integer :: x(:)
+  class(*), allocatable :: p(:)
+  p = inner(x)
+contains
+  elemental function inner(x)
+    integer, intent(in) :: x
+    type(t1) :: inner
+  end function inner
+end subroutine test7
+! CHECK-LABEL:   func.func @_QPtest7(
+! CHECK:           %[[VAL_12:.*]] = hlfir.elemental
+! CHECK:             %[[VAL_16:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.type<_QMtypesTt1>>) -> (!fir.ref<!fir.type<_QMtypesTt1>>, !fir.ref<!fir.type<_QMtypesTt1>>)
+! CHECK:             %[[VAL_17:.*]] = arith.constant false
+! CHECK:             %[[VAL_18:.*]] = hlfir.as_expr %[[VAL_16]]#0 move %[[VAL_17]] : (!fir.ref<!fir.type<_QMtypesTt1>>, i1) -> !hlfir.expr<!fir.type<_QMtypesTt1>>
+! CHECK:             hlfir.yield_element %[[VAL_18]] : !hlfir.expr<!fir.type<_QMtypesTt1>>
+
+subroutine test8
+  if (associated(inner())) STOP 1
+contains
+  function inner()
+    real, pointer :: inner
+  end function inner
+end subroutine test8
+! CHECK-LABEL:   func.func @_QPtest8() {
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK:           %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.ptr<f32>>) -> !fir.ptr<f32>
diff --git a/flang/test/Lower/HLFIR/function-return-destroy.f90 b/flang/test/Lower/HLFIR/function-return-destroy.f90
new file mode 100644
index 00000000000000..4663dc57e0457f
--- /dev/null
+++ b/flang/test/Lower/HLFIR/function-return-destroy.f90
@@ -0,0 +1,197 @@
+! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nowhere | FileCheck %s
+
+module types
+  type t1
+     real :: x
+  end type t1
+  type t2
+     real, allocatable :: x
+  end type t2
+  type t3
+     real, pointer :: p
+  end type t3
+  type t4
+     type(t1) :: c
+  end type t4
+  type t5
+     type(t2) :: c
+  end type t5
+  type t6
+   contains
+     final :: finalize_t6
+  end type t6
+  type, extends(t1) :: t7
+  end type t7
+  type, extends(t2) :: t8
+  end type t8
+  type, extends(t6) :: t9
+  end type t9
+contains
+  subroutine finalize_t6(x)
+    type(t6), intent(inout) :: x
+  end subroutine finalize_t6
+end module types
+
+subroutine test1
+  use types
+  interface
+     function ret_type_t1
+       use types
+       type(t1) :: ret_type_t1
+     end function ret_type_t1
+  end interface
+  type(t1) :: x
+  x = ret_type_t1()
+end subroutine test1
+! CHECK-LABEL:   func.func @_QPtest1() {
+! CHECK-NOT: fir.call{{.*}}Destroy
+
+subroutine test1a
+  use types
+  interface
+     function ret_type_t1a
+       use types
+       type(t1), allocatable :: ret_type_t1a
+     end function ret_type_t1a
+  end interface
+  type(t1), allocatable :: x
+  x = ret_type_t1a()
+end subroutine test1a
+! CHECK-LABEL:   func.func @_QPtest1a() {
+! CHECK-NOT: fir.call{{.*}}Destroy
+! CHECK:           fir.if %{{.*}} {
+! CHECK-NEXT:        fir.freemem %{{.*}} : !fir.heap<!fir.type<_QMtypesTt1{x:f32}>>
+! CHECK-NOT: fir.call{{.*}}Destroy
+! CHECK:           fir.if %{{.*}} {
+! CHECK:             fir.call @_FortranAAllocatableDeallocate({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK-NOT: fir.call{{.*}}Destroy
+
+subroutine test1c
+  use types
+  interface
+     function ret_class_t1
+       use types
+       class(t1), allocatable :: ret_class_t1
+     end function ret_class_t1
+  end interface
+  type(t1) :: x
+  x = ret_class_t1()
+end subroutine test1c
+! CHECK-LABEL:   func.func @_QPtest1c() {
+! CHECK: fir.call @_FortranADestroy
+! CHECK:           fir.if %{{.*}} {
+! CHECK-NEXT:        fir.freemem %{{.*}} : !fir.heap<!fir.type<_QMtypesTt1{x:f32}>>
+
+subroutine test2
+  use types
+  interface
+     function ret_type_t2
+       use types
+       type(t2) :: ret_type_t2
+     end function ret_type_t2
+  end interface
+  type(t2) :: x
+  x = ret_type_t2()
+end subroutine test2
+! CHECK-LABEL:   func.func @_QPtest2() {
+! CHECK: fir.call @_FortranADestroy
+
+subroutine test3
+  use types
+  interface
+     function ret_type_t3
+       use types
+       type(t3) :: ret_type_t3
+     end function ret_type_t3
+  end interface
+  type(t3) :: x
+  x = ret_type_t3()
+end subroutine test3
+! CHECK-LABEL:   func.func @_QPtest3() {
+! CHECK-NOT: fir.call{{.*}}Destroy
+
+subroutine test4
+  use types
+  interface
+     function ret_type_t4
+       use types
+       type(t4) :: ret_type_t4
+     end function ret_type_t4
+  end interface
+  type(t4) :: x
+  x = ret_type_t4()
+end subroutine test4
+! CHECK-LABEL:   func.func @_QPtest4() {
+! CHECK-NOT: fir.call{{.*}}Destroy
+
+subroutine test5
+  use types
+  interface
+     function ret_type_t5
+       use types
+       type(t5) :: ret_type_t5
+     end function ret_type_t5
+  end interface
+  type(t5) :: x
+  x = ret_type_t5()
+end subroutine test5
+! CHECK-LABEL:   func.func @_QPtest5() {
+! CHECK: fir.call @_FortranADestroy
+
+subroutine test6
+  use types
+  interface
+     function ret_type_t6
+       use types
+       type(t6) :: ret_type_t6
+     end function ret_type_t6
+  end interface
+  type(t6) :: x
+  x = ret_type_t6()
+end subroutine test6
+! CHECK-LABEL:   func.func @_QPtest6() {
+! CHECK: fir.call @_FortranADestroy
+! CHECK: fir.call @_FortranADestroy
+
+subroutine test7
+  use types
+  interface
+     function ret_type_t7
+       use types
+       type(t7) :: ret_type_t7
+     end function ret_type_t7
+  end interface
+  type(t7) :: x
+  x = ret_type_t7()
+end subroutine test7
+! CHECK-LABEL:   func.func @_QPtest7() {
+! CHECK-NOT: fir.call{{.*}}Destroy
+
+subroutine test8
+  use types
+  interface
+     function ret_type_t8
+       use types
+       type(t8) :: ret_type_t8
+     end function ret_type_t8
+  end interface
+  type(t8) :: x
+  x = ret_type_t8()
+end subroutine test8
+! CHECK-LABEL:   func.func @_QPtest8() {
+! CHECK: fir.call @_FortranADestroy
+
+subroutine test9
+  use types
+  interface
+     function ret_type_t9
+       use types
+       type(t9) :: ret_type_t9
+     end function ret_type_t9
+  end interface
+  type(t9) :: x
+  x = ret_type_t9()
+end subroutine test9
+! CHECK-LABEL:   func.func @_QPtest9() {
+! CHECK: fir.call @_FortranADestroy
+! CHECK: fir.call @_FortranADestroy
diff --git a/flang/test/Lower/HLFIR/function-return.f90 b/flang/test/Lower/HLFIR/function-return.f90
index 42f66c3d03e14f..8127133697c981 100644
--- a/flang/test/Lower/HLFIR/function-return.f90
+++ b/flang/test/Lower/HLFIR/function-return.f90
@@ -17,10 +17,9 @@ integer function simple_return()
 end function
 ! CHECK-LABEL: func.func @_QPchar_return(
 ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<!fir.char<1,10>>
-! CHECK:  %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:  %[[VAL_3:.*]] = arith.constant 10 : index
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[VAL_3]] {uniq_name = "_QFchar_returnEchar_return"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK:  %[[VAL_8:.*]] = fir.emboxchar %[[VAL_4]]#1, %[[VAL_3]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] {uniq_name = "_QFchar_returnEchar_return"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
+! CHECK:  %[[VAL_8:.*]] = fir.emboxchar %[[VAL_4]]#1, %[[VAL_3]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
 ! CHECK:  return %[[VAL_8]] : !fir.boxchar<1>
 
 integer function array_return()
diff --git a/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90 b/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90
new file mode 100644
index 00000000000000..b63026b1e3f103
--- /dev/null
+++ b/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90
@@ -0,0 +1,111 @@
+! Test automatic deallocation of allocatable components
+! of local variables as described in Fortran 2018 standard
+! 9.7.3.2 point 2. and 3.
+! The allocatable components of local variables are local variables
+! themselves due to 5.4.3.2.2 p. 2, note 1.
+! RUN: bbc -emit-hlfir -o - -I nowhere %s | FileCheck %s
+
+module types
+  type t1
+     real, allocatable :: x
+  end type t1
+  type t2
+     type(t1) :: x
+  end type t2
+  type, extends(t1) :: t3
+  end type t3
+  type, extends(t3) :: t4
+  end type t4
+  type, extends(t2) :: t5
+  end type t5
+end module types
+
+subroutine test1()
+  use types
+  type(t1) :: x1
+end subroutine test1
+! CHECK-LABEL:   func.func @_QPtest1() {
+! CHECK-DAG:       %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt1{x:!fir.box<!fir.heap<f32>>}>>) -> !fir.box<none>
+
+subroutine test1b()
+  use types
+  block
+    type(t1) :: x1
+  end block
+end subroutine test1b
+! CHECK-LABEL:   func.func @_QPtest1b() {
+! CHECK-DAG:       %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt1{x:!fir.box<!fir.heap<f32>>}>>) -> !fir.box<none>
+
+subroutine test2()
+  use types
+  type(t2) :: x2
+end subroutine test2
+! CHECK-LABEL:   func.func @_QPtest2() {
+! CHECK-DAG:       %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt2{x:!fir.type<_QMtypesTt1{x:!fir.box<!fir.heap<f32>>}>}>>) -> !fir.box<none>
+
+subroutine test2b()
+  use types
+  block
+    type(t2) :: x2
+  end block
+end subroutine test2b
+! CHECK-LABEL:   func.func @_QPtest2b() {
+! CHECK-DAG:       %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt2{x:!fir.type<_QMtypesTt1{x:!fir.box<!fir.heap<f32>>}>}>>) -> !fir.box<none>
+
+subroutine test3()
+  use types
+  type(t3) :: x3
+end subroutine test3
+! CHECK-LABEL:   func.func @_QPtest3() {
+! CHECK-DAG:       %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt3{x:!fir.box<!fir.heap<f32>>}>>) -> !fir.box<none>
+
+subroutine test3b()
+  use types
+  block
+    type(t3) :: x3
+  end block
+end subroutine test3b
+! CHECK-LABEL:   func.func @_QPtest3b() {
+! CHECK-DAG:       %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt3{x:!fir.box<!fir.heap<f32>>}>>) -> !fir.box<none>
+
+subroutine test4()
+  use types
+  type(t4) :: x4
+end subroutine test4
+! CHECK-LABEL:   func.func @_QPtest4() {
+! CHECK-DAG:       %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt4{x:!fir.box<!fir.heap<f32>>}>>) -> !fir.box<none>
+
+subroutine test4b()
+  use types
+  block
+    type(t4) :: x4
+  end block
+end subroutine test4b
+! CHECK-LABEL:   func.func @_QPtest4b() {
+! CHECK-DAG:       %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt4{x:!fir.box<!fir.heap<f32>>}>>) -> !fir.box<none>
+
+subroutine test5()
+  use types
+  type(t5) :: x5
+end subroutine test5
+! CHECK-LABEL:   func.func @_QPtest5() {
+! CHECK-DAG:       %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt5{x:!fir.type<_QMtypesTt1{x:!fir.box<!fir.heap<f32>>}>}>>) -> !fir.box<none>
+
+subroutine test5b()
+  use types
+  block
+    type(t5) :: x5
+  end block
+end subroutine test5b
+! CHECK-LABEL:   func.func @_QPtest5b() {
+! CHECK-DAG:       %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK-DAG:       %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box<!fir.type<_QMtypesTt5{x:!fir.type<_QMtypesTt1{x:!fir.box<!fir.heap<f32>>}>}>>) -> !fir.box<none>
diff --git a/flang/test/Lower/HLFIR/structure-constructor.f90 b/flang/test/Lower/HLFIR/structure-constructor.f90
index 90d5922d67a675..797f801ad64240 100644
--- a/flang/test/Lower/HLFIR/structure-constructor.f90
+++ b/flang/test/Lower/HLFIR/structure-constructor.f90
@@ -41,8 +41,8 @@ end subroutine test1
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.type<_QMtypesTt1{c:!fir.char<1,4>}> {bindc_name = "res", uniq_name = "_QFtest1Eres"}
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest1Eres"} : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>, !fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>)
 ! CHECK:           %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:           %[[VAL_5:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_5]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>, !fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>)
 ! CHECK:           %[[VAL_9:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> !fir.box<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>
@@ -220,8 +220,8 @@ end subroutine test6
 ! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.type<_QMtypesTt6{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>
 ! CHECK:           %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:           %[[VAL_8:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>>
+! CHECK:           %[[VAL_8:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_8]] {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
 ! CHECK:           %[[VAL_11:.*]] = fir.alloca !fir.type<_QMtypesTt6{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}> {bindc_name = "res", uniq_name = "_QFtest6Eres"}
 ! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFtest6Eres"} : (!fir.ref<!fir.type<_QMtypesTt6{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>)
diff --git a/flang/test/Lower/HLFIR/type-info.f90 b/flang/test/Lower/HLFIR/type-info.f90
new file mode 100644
index 00000000000000..e0716fd069020e
--- /dev/null
+++ b/flang/test/Lower/HLFIR/type-info.f90
@@ -0,0 +1,60 @@
+! Test generation of noinit, nofinal, and nodestroy fir.type_info attributes
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+module tyinfo
+  type boring_type
+  end type
+
+  type needs_final
+    contains
+      final :: needs_final_final
+  end type
+
+  type needs_init1
+    integer :: i =0
+  end type
+
+  type needs_init_and_destroy
+    integer, allocatable :: x
+  end type
+
+  type needs_all
+    type(needs_final) :: x
+    type(needs_init_and_destroy) :: y
+  end type
+
+  type, extends(needs_final) :: inherits_final
+  end type
+  type, extends(needs_init1) :: inherits_init
+  end type
+  type, extends(needs_init_and_destroy) :: inherits_init_and_destroy
+  end type
+  type, extends(needs_all) :: inherits_all
+  end type
+
+  interface
+    subroutine needs_final_final(x)
+      type(needs_final), intent(inout) :: x
+    end subroutine
+  end interface
+
+  type(boring_type) :: x1
+  type(needs_final) :: x2
+  type(needs_init1) :: x3
+  type(needs_init_and_destroy) :: x4
+  type(needs_all) :: x5
+  type(inherits_final) :: x6
+  type(inherits_init) :: x7
+  type(inherits_init_and_destroy) :: x8
+  type(inherits_all) :: x9
+end module
+
+! CHECK-DAG:  fir.type_info @_QMtyinfoTboring_type noinit nodestroy nofinal : !fir.type<_QMtyinfoTboring_type>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTneeds_final noinit : !fir.type<_QMtyinfoTneeds_final>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTneeds_init1 nodestroy nofinal : !fir.type<_QMtyinfoTneeds_init1{i:i32}>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTneeds_init_and_destroy nofinal : !fir.type<_QMtyinfoTneeds_init_and_destroy{x:!fir.box<!fir.heap<i32>>}>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTneeds_all : !fir.type<_QMtyinfoTneeds_all{x:!fir.type<_QMtyinfoTneeds_final>,y:!fir.type<_QMtyinfoTneeds_init_and_destroy{x:!fir.box<!fir.heap<i32>>}>}>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTinherits_final noinit extends !fir.type<_QMtyinfoTneeds_final> : !fir.type<_QMtyinfoTinherits_final>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTinherits_init nodestroy nofinal extends !fir.type<_QMtyinfoTneeds_init1{i:i32}> : !fir.type<_QMtyinfoTinherits_init{i:i32}>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTinherits_init_and_destroy nofinal extends !fir.type<_QMtyinfoTneeds_init_and_destroy{x:!fir.box<!fir.heap<i32>>}> : !fir.type<_QMtyinfoTinherits_init_and_destroy{x:!fir.box<!fir.heap<i32>>}>
+! CHECK-DAG:  fir.type_info @_QMtyinfoTinherits_all extends !fir.type<_QMtyinfoTneeds_all{x:!fir.type<_QMtyinfoTneeds_final>,y:!fir.type<_QMtyinfoTneeds_init_and_destroy{x:!fir.box<!fir.heap<i32>>}>}> : !fir.type<_QMtyinfoTinherits_all{x:!fir.type<_QMtyinfoTneeds_final>,y:!fir.type<_QMtyinfoTneeds_init_and_destroy{x:!fir.box<!fir.heap<i32>>}>}>
diff --git a/flang/test/Lower/HLFIR/where.f90 b/flang/test/Lower/HLFIR/where.f90
index 0f3efcc0b31ab5..bd4cc64de95094 100644
--- a/flang/test/Lower/HLFIR/where.f90
+++ b/flang/test/Lower/HLFIR/where.f90
@@ -79,8 +79,10 @@ subroutine where_cleanup()
 ! CHECK:    %[[VAL_6:.*]] = fir.call @_QPreturn_temporary_mask() fastmath<contract> : () -> !fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>
 ! CHECK:    fir.save_result %[[VAL_6]] to %[[VAL_1]] : !fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
 ! CHECK:    %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>)
-! CHECK:    %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
-! CHECK:    hlfir.yield %[[VAL_8]] : !fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>> cleanup {
+! CHECK:             %[[deref:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
+! CHECK:             %[[MustFree:.*]] = arith.constant false
+! CHECK:             %[[ResTemp:.*]] = hlfir.as_expr %[[deref]] move %[[MustFree]] : (!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>, i1) -> !hlfir.expr<?x!fir.logical<4>>
+! CHECK:             hlfir.yield %[[ResTemp]] : !hlfir.expr<?x!fir.logical<4>> cleanup {
 ! CHECK:        fir.freemem
 ! CHECK:    }
 ! CHECK:  } do {
@@ -88,8 +90,10 @@ subroutine where_cleanup()
 ! CHECK:      %[[VAL_14:.*]] = fir.call @_QPreturn_temporary_array() fastmath<contract> : () -> !fir.box<!fir.heap<!fir.array<?xf32>>>
 ! CHECK:      fir.save_result %[[VAL_14]] to %[[VAL_0]] : !fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK:      %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK:      %[[VAL_16:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK:      hlfir.yield %[[VAL_16]] : !fir.box<!fir.heap<!fir.array<?xf32>>> cleanup {
+! CHECK:               %[[deref:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK:               %[[MustFree:.*]] = arith.constant false
+! CHECK:               %[[ResTemp:.*]] = hlfir.as_expr %[[deref]] move %[[MustFree]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, i1) -> !hlfir.expr<?xf32>
+! CHECK:               hlfir.yield %[[ResTemp]] : !hlfir.expr<?xf32> cleanup {
 ! CHECK:          fir.freemem
 ! CHECK:      }
 ! CHECK:    } to {
diff --git a/flang/test/Lower/Intrinsics/merge.f90 b/flang/test/Lower/Intrinsics/merge.f90
index 0ceb670f20fbd9..c8b6b3294b3280 100644
--- a/flang/test/Lower/Intrinsics/merge.f90
+++ b/flang/test/Lower/Intrinsics/merge.f90
@@ -14,7 +14,7 @@ function merge_test(o1, o2, mask)
 ! CHECK: %[[a2:.*]] = fir.load %[[arg4]] : !fir.ref<!fir.logical<4>>
 ! CHECK: %[[a3:.*]] = fir.convert %[[a2]] : (!fir.logical<4>) -> i1
 ! CHECK: %[[a4:.*]] = arith.select %[[a3]], %[[a0_cast]], %[[a1_cast]] : !fir.ref<!fir.char<1>>
-! CHECK:  %{{.*}} = fir.convert %[[a4]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
+! CHECK:  %{{.*}} = fir.load %[[a4]] : !fir.ref<!fir.char<1>>
 end
 
 ! CHECK-LABEL: func @_QPmerge_test2(
diff --git a/flang/test/Lower/Intrinsics/storage_size-2.f90 b/flang/test/Lower/Intrinsics/storage_size-2.f90
index e784063c76c350..1195583dc87273 100644
--- a/flang/test/Lower/Intrinsics/storage_size-2.f90
+++ b/flang/test/Lower/Intrinsics/storage_size-2.f90
@@ -13,7 +13,10 @@ function return_char(l)
   print*, storage_size(return_char(n))
 ! CHECK: %[[val_16:.*]] = fir.call @_QPreturn_char(%[[res_addr:[^,]*]], %[[res_len:[^,]*]], {{.*}})
 ! CHECK: %[[res:.*]]:2 = hlfir.declare %[[res_addr]] typeparams %[[res_len]]
-! CHECK: %[[val_18:.*]] = fir.embox %[[res]]#1 typeparams %[[res_len]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
+! CHECK: %[[false:.*]] = arith.constant false
+! CHECK: %[[expr:.*]] = hlfir.as_expr %[[res]]#0 move %[[false]] : (!fir.boxchar<1>, i1) -> !hlfir.expr<!fir.char<1,?>>
+! CHECK: %[[assoc:.*]]:3 = hlfir.associate %[[expr]] typeparams %[[res_len]] {uniq_name = "adapt.valuebyref"} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1)
+! CHECK: %[[val_18:.*]] = fir.embox %[[assoc]]#1 typeparams %[[res_len]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
 ! CHECK: %[[val_19:.*]] = fir.box_elesize %[[val_18]] : (!fir.box<!fir.char<1,?>>) -> i32
 ! CHECK: %[[val_20:.*]] = arith.constant 8 : i32
 ! CHECK: %[[val_21:.*]] = arith.muli %[[val_19]], %[[val_20]] : i32
diff --git a/flang/test/Lower/OpenACC/acc-enter-data.f90 b/flang/test/Lower/OpenACC/acc-enter-data.f90
index 5e18528dfc8299..93418a1fca3ec5 100644
--- a/flang/test/Lower/OpenACC/acc-enter-data.f90
+++ b/flang/test/Lower/OpenACC/acc-enter-data.f90
@@ -178,9 +178,8 @@ subroutine acc_enter_data
   !$acc enter data copyin(a(1:,1:5))
 !CHECK: %[[ONE:.*]] = arith.constant 1 : index
 !CHECK: %[[LB1:.*]] = arith.constant 0 : index
-!CHECK: %[[LBEXT:.*]] = arith.addi %c10{{.*}}, %[[ONE]] : index
-!CHECK: %[[UB1:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) upperbound(%[[UB1]] : index) stride(%[[ONE]] : index) startIdx(%c1{{.*}} : index)
+!CHECK: %[[UB1:.*]] = arith.subi %c10{{.*}}, %[[ONE]] : index
+!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) upperbound(%[[UB1]] : index) extent(%c10{{.*}} : index) stride(%[[ONE]] : index) startIdx(%c1{{.*}} : index)
 !CHECK: %[[LB2:.*]] = arith.constant 0 : index
 !CHECK: %[[UB2:.*]] = arith.constant 4 : index
 !CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) upperbound(%[[UB2]] : index) stride(%[[ONE]] : index) startIdx(%c1{{.*}} : index)
@@ -203,12 +202,10 @@ subroutine acc_enter_data
   !$acc enter data copyin(a(:,:))
 !CHECK: %[[LB:.*]] = arith.constant 0 : index
 !CHECK: %[[ONE:.*]] = arith.constant 1 : index
-!CHECK: %[[LBEXT:.*]] = arith.addi %c10{{.*}}, %[[ONE]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[ONE]] : index)
-!CHECK: %[[LBEXT:.*]] = arith.addi %c10{{.*}}, %[[ONE]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[ONE]] : index)
+!CHECK: %[[UB:.*]] = arith.subi %c10{{.*}}, %[[ONE]] : index
+!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%c10{{.*}} : index) stride(%[[ONE]] : index) startIdx(%[[ONE]] : index)
+!CHECK: %[[UB:.*]] = arith.subi %c10{{.*}}, %[[ONE]] : index
+!CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%c10{{.*}} : index) stride(%[[ONE]] : index) startIdx(%[[ONE]] : index)
 !FIR: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a(:,:)", structured = false}
 !HLFIR: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a(:,:)", structured = false}
 end subroutine acc_enter_data
@@ -285,7 +282,7 @@ subroutine acc_enter_data_dummy(a, b, n, m)
 !CHECK: %[[LB:.*]] = arith.subi %[[CONVERT_N]], %[[N_IDX]] : index
 !CHECK: %[[LBEXT:.*]] = arith.addi %[[EXT_B]], %[[N_IDX]] : index
 !CHECK: %[[UB:.*]] = arith.subi %[[LBEXT:.*]], %[[ONE]] : index
-!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[N_IDX]] : index)
+!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[EXT_B]] : index) stride(%[[ONE]] : index) startIdx(%[[N_IDX]] : index)
 !FIR: %[[CREATE1:.*]] = acc.create varPtr(%[[B]] : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND1]]) -> !fir.ref<!fir.array<?xf32>> {name = "b(n:)", structured = false}
 !HLFIR: %[[CREATE1:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND1]]) -> !fir.ref<!fir.array<?xf32>> {name = "b(n:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE1]] : !fir.ref<!fir.array<?xf32>>)
@@ -294,7 +291,7 @@ subroutine acc_enter_data_dummy(a, b, n, m)
 !CHECK: %[[ONE:.*]] = arith.constant 1 : index
 !CHECK: %[[LBEXT:.*]] = arith.addi %[[EXT_B]], %[[N_IDX]] : index
 !CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[N_IDX]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[N_IDX]] : index)
+!CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[N_IDX]] : index) upperbound(%[[UB]] : index) extent(%[[EXT_B]] : index) stride(%[[ONE]] : index) startIdx(%[[N_IDX]] : index)
 !FIR: %[[CREATE1:.*]] = acc.create varPtr(%[[B]] : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND1]]) -> !fir.ref<!fir.array<?xf32>> {name = "b(:)", structured = false}
 !HLFIR: %[[CREATE1:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND1]]) -> !fir.ref<!fir.array<?xf32>> {name = "b(:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE1]] : !fir.ref<!fir.array<?xf32>>)
@@ -329,7 +326,7 @@ subroutine acc_enter_data_non_default_lb()
 !CHECK: %[[ONE:.*]] = arith.constant 1 : index
 !CHECK: %[[LBEXT:.*]] = arith.addi %[[EXTENT_C10]], %[[BASELB]] : index
 !CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[BASELB]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[BASELB]] : index)
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[BASELB]] : index) upperbound(%[[UB]] : index) extent(%[[EXTENT_C10]] : index) stride(%[[ONE]] : index) startIdx(%[[BASELB]] : index)
 !FIR: %[[CREATE:.*]] = acc.create varPtr(%[[A]] : !fir.ref<!fir.array<10xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xi32>> {name = "a(:)", structured = false}
 !HLFIR: %[[CREATE:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xi32>> {name = "a(:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE]] : !fir.ref<!fir.array<10xi32>>)
@@ -349,7 +346,7 @@ subroutine acc_enter_data_non_default_lb()
 !CHECK: %[[LB:.*]] = arith.subi %[[SECTIONLB]], %[[BASELB]] : index
 !CHECK: %[[LBEXT:.*]] = arith.addi %[[EXTENT_C10]], %[[BASELB]] : index
 !CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[BASELB]] : index)
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[EXTENT_C10]] : index) stride(%[[ONE]] : index) startIdx(%[[BASELB]] : index)
 !FIR: %[[CREATE:.*]] = acc.create varPtr(%[[A]] : !fir.ref<!fir.array<10xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xi32>> {name = "a(4:)", structured = false}
 !HLFIR: %[[CREATE:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xi32>> {name = "a(4:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE]] : !fir.ref<!fir.array<10xi32>>)
@@ -402,9 +399,8 @@ subroutine acc_enter_data_assumed(a, b, n, m)
 !CHECK: %[[C0:.*]] = arith.constant 0 : index
 !FIR: %[[DIMS1:.*]]:3 = fir.box_dims %[[A]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 !HLFIR: %[[DIMS1:.*]]:3 = fir.box_dims %[[DECLA]]#1, %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-!CHECK: %[[LBEXT:.*]] = arith.addi %[[DIMS1]]#1, %[[ONE]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[DIMS0]]#2 : index) startIdx(%[[ONE]] : index) {strideInBytes = true}
+!CHECK: %[[UB:.*]] = arith.subi %[[DIMS1]]#1, %[[ONE]] : index
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) startIdx(%[[ONE]] : index) {strideInBytes = true}
 !FIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[A]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECLA]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<?xf32>> {name = "a(:)", structured = false}
@@ -419,9 +415,8 @@ subroutine acc_enter_data_assumed(a, b, n, m)
 !CHECK: %[[C0:.*]] = arith.constant 0 : index
 !FIR: %[[DIMS1:.*]]:3 = fir.box_dims %[[A]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 !HLFIR: %[[DIMS1:.*]]:3 = fir.box_dims %[[DECLA]]#1, %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-!CHECK: %[[LBEXT:.*]] = arith.addi %[[DIMS1]]#1, %[[ONE]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[DIMS0]]#2 : index) startIdx(%[[ONE]] : index) {strideInBytes = true}
+!CHECK: %[[UB:.*]] = arith.subi %[[DIMS1]]#1, %[[ONE]] : index
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) startIdx(%[[ONE]] : index) {strideInBytes = true}
 !FIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[A]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECLA]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<?xf32>> {name = "a(2:)", structured = false}
@@ -465,9 +460,8 @@ subroutine acc_enter_data_assumed(a, b, n, m)
 !CHECK: %[[C0:.*]] = arith.constant 0 : index
 !FIR: %[[DIMS:.*]]:3 = fir.box_dims %[[A]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 !HLFIR: %[[DIMS:.*]]:3 = fir.box_dims %[[DECLA]]#1, %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-!CHECK: %[[LBEXT:.*]] = arith.addi %[[DIMS]]#1, %[[ONE]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[DIMS0]]#2 : index) startIdx(%[[ONE]] : index) {strideInBytes = true}
+!CHECK: %[[UB:.*]] = arith.subi %[[DIMS]]#1, %[[ONE]] : index
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS]]#1 : index) stride(%[[DIMS0]]#2 : index) startIdx(%[[ONE]] : index) {strideInBytes = true}
 !FIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[A]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECLA]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 !CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<?xf32>> {name = "a(n:)", structured = false}
@@ -619,7 +613,7 @@ subroutine acc_enter_data_allocatable()
 !CHECK: %[[DIMS2:.*]]:3 = fir.box_dims %[[BOX_A_1]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
 !CHECK: %[[LBEXT:.*]] = arith.addi %[[DIMS2:.*]]#1, %[[DIMS0]]#0 : index
 !CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[DIMS1]]#2 : index) startIdx(%[[DIMS0]]#0 : index) {strideInBytes = true}
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[DIMS2]]#1 : index) stride(%[[DIMS1]]#2 : index) startIdx(%[[DIMS0]]#0 : index) {strideInBytes = true}
 !CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX_A_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
 !CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.heap<!fir.array<?xf32>> {name = "a(3:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE]] : !fir.heap<!fir.array<?xf32>>)
@@ -720,9 +714,8 @@ subroutine acc_enter_data_derived_type()
 !HLFIR: %[[ARRAY_COORD:.*]] = hlfir.designate %[[DECLA]]#0{"array"}   shape %{{.*}} : (!fir.ref<!fir.type<_QFacc_enter_data_derived_typeTdt{data:f32,array:!fir.array<10xf32>}>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xf32>>
 !CHECK: %[[LB:.*]] = arith.constant 0 : index
 !CHECK: %[[C1:.*]] = arith.constant 1 : index
-!CHECK: %[[LBEXT:.*]] = arith.addi %[[C10]], %[[C1]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[C1]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+!CHECK: %[[UB:.*]] = arith.subi %[[C10]], %[[C1]] : index
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[C10]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 !CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[ARRAY_COORD]] : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a%array(:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE]] : !fir.ref<!fir.array<10xf32>>)
 
@@ -755,9 +748,8 @@ subroutine acc_enter_data_derived_type()
 !HLFIR: %[[ARRAY_COORD:.*]] = hlfir.designate %[[DECLA]]#0{"array"}   shape %{{.*}} : (!fir.ref<!fir.type<_QFacc_enter_data_derived_typeTdt{data:f32,array:!fir.array<10xf32>}>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xf32>>
 !CHECK: %[[ONE:.*]] = arith.constant 1 : index
 !CHECK: %[[LB:.*]] = arith.constant 1 : index
-!CHECK: %[[LBEXT:.*]] = arith.addi %[[C10]], %[[ONE]] : index
-!CHECK: %[[UB:.*]] = arith.subi %[[LBEXT]], %[[ONE]] : index
-!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[ONE]] : index) startIdx(%[[ONE]] : index)
+!CHECK: %[[UB:.*]] = arith.subi %[[C10]], %[[ONE]] : index
+!CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[C10]] : index) stride(%[[ONE]] : index) startIdx(%[[ONE]] : index)
 !CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[ARRAY_COORD]] : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a%array(2:)", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE]] : !fir.ref<!fir.array<10xf32>>)
 
diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 0f9cfad1043be0..6c71a9bc72ff31 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -3,7 +3,7 @@
 ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK,FIR
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK,HLFIR
 
-! CHECK-LABEL: acc.private.recipe @"privatization_box_?xi32" : !fir.box<!fir.array<?xi32>> init {
+! CHECK-LABEL: acc.private.recipe @privatization_box_Uxi32 : !fir.box<!fir.array<?xi32>> init {
 ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
 ! HLFIR:   %[[C0:.*]] = arith.constant 0 : index
 ! HLFIR:   %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
@@ -194,4 +194,4 @@ subroutine acc_private_assumed_shape(a, n)
 ! HLFIR: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! HLFIR: %[[ADDR:.*]] = fir.box_addr %[[DECL_A]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
 ! HLFIR: %[[PRIVATE:.*]] = acc.private varPtr(%[[ADDR]] : !fir.ref<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {name = "a"}
-! HLFIR: acc.parallel private(@"privatization_box_?xi32" -> %[[PRIVATE]] : !fir.ref<!fir.array<?xi32>>) {
+! HLFIR: acc.parallel private(@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.ref<!fir.array<?xi32>>) {
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 13e6a575e4bd63..07979445394d92 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -3,7 +3,68 @@
 ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK,FIR
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK,HLFIR
 
-! CHECK-LABEL: acc.reduction.recipe @"reduction_max_ref_?xf32" : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_ptr_Uxf32 : !fir.box<!fir.ptr<!fir.array<?xf32>>> reduction_operator <max> init {
+! CHECK: ^bb0(%{{.*}}: !fir.box<!fir.ptr<!fir.array<?xf32>>>):
+! CHECK: } combiner {
+! CHECK: ^bb0(%{{.*}}: !fir.box<!fir.ptr<!fir.array<?xf32>>>, %{{.*}}: !fir.box<!fir.ptr<!fir.array<?xf32>>>, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_heap_Uxf32 : !fir.box<!fir.heap<!fir.array<?xf32>>> reduction_operator <max> init {
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.heap<!fir.array<?xf32>>>):
+! HLFIR:   %[[CST:.*]] = arith.constant -1.401300e-45 : f32
+! HLFIR:   %[[C0:.*]] = arith.constant 0 : index
+! HLFIR:   %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+! HLFIR:   %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! HLFIR:   %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
+! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %2(%1) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
+! HLFIR:   hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.box<!fir.array<?xf32>>
+! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xf32>>
+! CHECK: } combiner {
+! HLFIR: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.heap<!fir.array<?xf32>>>, %[[ARG1:.*]]: !fir.box<!fir.heap<!fir.array<?xf32>>>, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index):
+! HLFIR:   %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! HLFIR:   %[[DES_V1:.*]] = hlfir.designate %[[ARG0]] (%[[ARG2]]:%[[ARG3]]:%[[ARG4]]) shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+! HLFIR:   %[[DES_V2:.*]] = hlfir.designate %[[ARG1]] (%[[ARG2]]:%[[ARG3]]:%[[ARG4]]) shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+! HLFIR:   %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+! HLFIR:   ^bb0(%[[IV:.*]]: index):
+! HLFIR:     %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]])  : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! HLFIR:     %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]])  : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! HLFIR:     %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
+! HLFIR:     %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
+! HLFIR:     %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] : f32
+! HLFIR:     %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
+! HLFIR:     hlfir.yield_element %[[SELECT]] : f32
+! HLFIR:   }
+! HLFIR:   hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.box<!fir.heap<!fir.array<?xf32>>>
+! HLFIR:   acc.yield %[[ARG0]] : !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
+! HLFIR:   %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! HLFIR:   %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! HLFIR:   %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
+! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+! HLFIR:   hlfir.assign %c0{{.*}} to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
+! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
+! CHECK: } combiner {
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
+! HLFIR:   %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! HLFIR:   %[[DES1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+! HLFIR:   %[[DES2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+! HLFIR:   %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+! HLFIR:   ^bb0(%[[IV:.*]]: index):
+! HLFIR:     %[[DES_V1:.*]] = hlfir.designate %[[DES1]] (%[[IV]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! HLFIR:     %[[DES_V2:.*]] = hlfir.designate %[[DES2]] (%[[IV]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! HLFIR:     %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
+! HLFIR:     %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
+! HLFIR:     %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
+! HLFIR:     hlfir.yield_element %[[COMBINED]] : i32
+! HLFIR:   }
+! HLFIR:   hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
+! HLFIR:   acc.yield %[[ARG0]] : !fir.box<!fir.array<?xi32>>
+! HLFIR: }
+
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
 ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>):
 ! CHECK:   %[[INIT_VALUE:.*]] = arith.constant -1.401300e-45 : f32
 ! HLFIR:   %[[C0:.*]] = arith.constant 0 : index
@@ -15,12 +76,12 @@
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xf32>>
 ! CHECK: } combiner {
 ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>>
-! HLFIR:   %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-! HLFIR:   %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! HLFIR:   %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+! HLFIR:   %[[LEFT:.*]] = hlfir.designate %[[ARG0]] (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+! HLFIR:   %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+! HLFIR:   %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
 ! HLFIR:   ^bb0(%{{.*}}: index):
-! HLFIR:     %[[DES_V1:.*]] = hlfir.designate %[[ARG0]] (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! HLFIR:     %[[DES_V2:.*]] = hlfir.designate %[[ARG1]] (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+! HLFIR:     %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+! HLFIR:     %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
 ! HLFIR:     %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<f32>
 ! HLFIR:     %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<f32>
 ! HLFIR:     %[[CMPF:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] : f32
@@ -31,7 +92,7 @@
 ! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xf32>>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @"reduction_add_ref_?xi32" : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
 ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
 ! HLFIR:   %[[INIT_VALUE:.*]] = arith.constant 0 : i32
 ! HLFIR:   %[[C0:.*]] = arith.constant 0 : index
@@ -43,12 +104,12 @@
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
 ! CHECK: } combiner {
 ! CHECK: ^bb0(%[[V1:.*]]: !fir.box<!fir.array<?xi32>>, %[[V2:.*]]: !fir.box<!fir.array<?xi32>>
-! HLFIR:   %[[BOX_DIMS]]:3 = fir.box_dims %[[V1]], %{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! HLFIR:   %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! HLFIR:   %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+! HLFIR:   %[[LEFT:.*]] = hlfir.designate %[[ARG0]] (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+! HLFIR:   %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+! HLFIR:   %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
 ! HLFIR:   ^bb0(%{{.*}}: index):
-! HLFIR:     %[[DES_V1:.*]] = hlfir.designate %[[V1]] (%{{.*}})  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! HLFIR:     %[[DES_V2:.*]] = hlfir.designate %[[V2]] (%{{.*}})  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! HLFIR:     %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}})  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! HLFIR:     %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}})  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 ! HLFIR:     %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
 ! HLFIR:     %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
 ! HLFIR:     %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
@@ -226,7 +287,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<i32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_section_ext100_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
@@ -241,7 +302,10 @@
 ! CHECK:   }
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index):
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK:   %[[LB0:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB0:.*]] = arith.constant 99 : index
+! CHECK:   %[[STEP0:.*]] = arith.constant 1 : index
 ! CHECK:   fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
 ! CHECK:     %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
 ! CHECK:     %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
@@ -271,7 +335,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<f32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_max_section_ext100xext10_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
 ! CHECK: ^bb0(%arg0: !fir.ref<!fir.array<100x10xi32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant -2147483648 : i32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32>
@@ -279,8 +343,14 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index, %[[LB1:.*]]: index, %[[UB1:.*]]: index, %[[STEP1:.*]]: index):
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
+! CHECK:   %[[LB0:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB0:.*]] = arith.constant 9 : index
+! CHECK:   %[[STEP0:.*]] = arith.constant 1 : index
 ! CHECK:   fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK:     %[[LB1:.*]] = arith.constant 0 : index
+! CHECK:     %[[UB1:.*]] = arith.constant 99 : index
+! CHECK:     %[[STEP1:.*]] = arith.constant 1 : index
 ! CHECK:     fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
 ! CHECK:       %[[COORD1:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[IV0]], %[[IV1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
 ! CHECK:       %[[COORD2:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[IV0]], %[[IV1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
@@ -311,7 +381,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<i32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_min_section_ext100xext10_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xf32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xf32>
@@ -319,8 +389,14 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xf32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index, %[[LB1:.*]]: index, %[[UB1:.*]]: index, %[[STEP1:.*]]: index):
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xf32>>):
+! CHECK:   %[[LB0:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB0:.*]] = arith.constant 9 : index
+! CHECK:   %[[STEP0:.*]] = arith.constant 1 : index
 ! CHECK:   fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK:     %[[LB1:.*]] = arith.constant 0 : index
+! CHECK:     %[[UB1:.*]] = arith.constant 99 : index
+! CHECK:     %[[STEP1:.*]] = arith.constant 1 : index
 ! CHECK:     fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
 ! CHECK:       %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]], %[[IV1]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
 ! CHECK:       %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]], %[[IV1]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
@@ -351,7 +427,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<f32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_min_section_ext100_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 2147483647 : i32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
@@ -359,7 +435,10 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index):
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK:   %[[LB0:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB0:.*]] = arith.constant 99 : index
+! CHECK:   %[[STEP0:.*]] = arith.constant 1 : index
 ! CHECK:   fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
 ! CHECK:     %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
 ! CHECK:     %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
@@ -405,7 +484,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<f32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_mul_section_ext100_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 1 : i32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
@@ -413,8 +492,11 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index):
-! CHECK:   fir.do_loop %[[IV:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK:   %[[LB:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB:.*]] = arith.constant 99 : index
+! CHECK:   %[[STEP:.*]] = arith.constant 1 : index
+! CHECK:   fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
 ! CHECK:     %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
 ! CHECK:     %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
 ! CHECK:     %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
@@ -441,7 +523,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<i32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_ext100_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
@@ -449,8 +531,11 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index):
-! CHECK:   fir.do_loop %[[IV:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
+! CHECK:   %[[LB:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB:.*]] = arith.constant 99 : index
+! CHECK:   %[[STEP:.*]] = arith.constant 1 : index
+! CHECK:   fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
 ! CHECK:   %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
 ! CHECK:   %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
 ! CHECK:   %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
@@ -477,7 +562,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<f32>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_ext100xext10xext2_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10x2xi32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 0 : i32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10x2xi32>
@@ -485,9 +570,18 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10x2xi32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index, %[[LB1:.*]]: index, %[[UB1:.*]]: index, %[[STEP1:.*]]: index, %[[LB2:.*]]: index, %[[UB2:.*]]: index, %[[STEP2:.*]]: index):
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>):
+! CHECK:   %[[LB0:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB0:.*]] = arith.constant 1 : index
+! CHECK:   %[[STEP0:.*]] = arith.constant 1 : index
 ! CHECK:   fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK:     %[[LB1:.*]] = arith.constant 0 : index
+! CHECK:     %[[UB1:.*]] = arith.constant 9 : index
+! CHECK:     %[[STEP1:.*]] = arith.constant 1 : index
 ! CHECK:     fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
+! CHECK:       %[[LB2:.*]] = arith.constant 0 : index
+! CHECK:       %[[UB2:.*]] = arith.constant 99 : index
+! CHECK:       %[[STEP2:.*]] = arith.constant 1 : index
 ! CHECK:       fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
 ! CHECK:         %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]], %[[IV1]], %[[IV2]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
 ! CHECK:         %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]], %[[IV1]], %[[IV2]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
@@ -501,7 +595,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10x2xi32>>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_ext100xext10_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xi32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 0 : i32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32>
@@ -509,8 +603,14 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
 ! HLFIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index, %[[LB1:.*]]: index, %[[UB1:.*]]: index, %[[STEP1:.*]]: index):
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
+! CHECK:   %[[LB0:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB0:.*]] = arith.constant 9 : index
+! CHECK:   %[[STEP0:.*]] = arith.constant 1 : index
 ! CHECK:   fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK:     %[[LB1:.*]] = arith.constant 0 : index
+! CHECK:     %[[UB1:.*]] = arith.constant 99 : index
+! CHECK:     %[[STEP1:.*]] = arith.constant 1 : index
 ! CHECK:     fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
 ! CHECK:       %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]], %[[IV1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
 ! CHECK:       %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]], %[[IV1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
@@ -523,7 +623,7 @@
 ! CHECK:   acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>>
 ! CHECK: }
 
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_ext100_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
 ! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
 ! CHECK:   %[[INIT:.*]] = arith.constant 0 : i32
 ! CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
@@ -531,8 +631,11 @@
 ! HLFIR:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! HFLIR:   acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
 ! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[STEP0:.*]]: index):
-! CHECK:   fir.do_loop %[[IV:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
+! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
+! CHECK:   %[[LB:.*]] = arith.constant 0 : index
+! CHECK:   %[[UB:.*]] = arith.constant 99 : index
+! CHECK:   %[[STEP:.*]] = arith.constant 1 : index
+! CHECK:   fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
 ! CHECK:     %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
 ! CHECK:     %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
 ! CHECK:     %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
@@ -591,7 +694,7 @@ subroutine acc_reduction_add_int_array_1d(a, b)
 ! HLFIR:       %[[DECLB:.*]]:2 = hlfir.declare %[[B]]
 ! FIR:         %[[RED_B:.*]] = acc.reduction varPtr(%[[B]] : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "b"}
 ! HLFIR:       %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "b"}
-! CHECK:       acc.loop reduction(@reduction_add_ref_100xi32 -> %[[RED_B]] : !fir.ref<!fir.array<100xi32>>)
+! CHECK:       acc.loop reduction(@reduction_add_section_ext100_ref_100xi32 -> %[[RED_B]] : !fir.ref<!fir.array<100xi32>>)
 
 subroutine acc_reduction_add_int_array_2d(a, b)
   integer :: a(100, 10), b(100, 10)
@@ -610,7 +713,7 @@ subroutine acc_reduction_add_int_array_2d(a, b)
 ! HLFIR:       %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]]
 ! FIR:         %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref<!fir.array<100x10xi32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10xi32>> {name = "b"}
 ! HLFIR:       %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref<!fir.array<100x10xi32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10xi32>> {name = "b"}
-! CHECK:       acc.loop reduction(@reduction_add_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10xi32>>) {
+! CHECK:       acc.loop reduction(@reduction_add_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10xi32>>) {
 ! CHECK: } attributes {collapse = 2 : i64}
 
 subroutine acc_reduction_add_int_array_3d(a, b)
@@ -632,7 +735,7 @@ subroutine acc_reduction_add_int_array_3d(a, b)
 ! HLFIR: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]]
 ! FIR:   %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref<!fir.array<100x10x2xi32>>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10x2xi32>> {name = "b"}
 ! HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref<!fir.array<100x10x2xi32>>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10x2xi32>> {name = "b"}
-! CHECK: acc.loop reduction(@reduction_add_ref_100x10x2xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10x2xi32>>)
+! CHECK: acc.loop reduction(@reduction_add_section_ext100xext10xext2_ref_100x10x2xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10x2xi32>>)
 ! CHECK: } attributes {collapse = 3 : i64}
 
 subroutine acc_reduction_add_float(a, b)
@@ -667,7 +770,7 @@ subroutine acc_reduction_add_float_array_1d(a, b)
 ! HLFIR: %[[DECLB:.*]]:2 = hlfir.declare %[[B]]
 ! FIR:   %[[RED_B:.*]] = acc.reduction varPtr(%[[B]] : !fir.ref<!fir.array<100xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xf32>> {name = "b"}
 ! HLFIR: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<100xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xf32>> {name = "b"}
-! CHECK:       acc.loop reduction(@reduction_add_ref_100xf32 -> %[[RED_B]] : !fir.ref<!fir.array<100xf32>>)
+! CHECK:       acc.loop reduction(@reduction_add_section_ext100_ref_100xf32 -> %[[RED_B]] : !fir.ref<!fir.array<100xf32>>)
 
 subroutine acc_reduction_mul_int(a, b)
   integer :: a(100)
@@ -701,7 +804,7 @@ subroutine acc_reduction_mul_int_array_1d(a, b)
 ! HLFIR:       %[[DECLB:.*]]:2 = hlfir.declare %[[B]]
 ! FIR:         %[[RED_B:.*]] = acc.reduction varPtr(%[[B]] : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "b"}
 ! HLFIR:       %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "b"}
-! CHECK:       acc.loop reduction(@reduction_mul_ref_100xi32 -> %[[RED_B]] : !fir.ref<!fir.array<100xi32>>)
+! CHECK:       acc.loop reduction(@reduction_mul_section_ext100_ref_100xi32 -> %[[RED_B]] : !fir.ref<!fir.array<100xi32>>)
 
 subroutine acc_reduction_mul_float(a, b)
   real :: a(100), b
@@ -735,7 +838,7 @@ subroutine acc_reduction_mul_float_array_1d(a, b)
 ! HLFIR:       %[[DECLB:.*]]:2 = hlfir.declare %[[B]]
 ! FIR:         %[[RED_B:.*]] = acc.reduction varPtr(%[[B]] : !fir.ref<!fir.array<100xf32>>) bounds(%2) -> !fir.ref<!fir.array<100xf32>> {name = "b"}
 ! HLFIR:       %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<100xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xf32>> {name = "b"}
-! CHECK:       acc.loop reduction(@reduction_mul_ref_100xf32 -> %[[RED_B]] : !fir.ref<!fir.array<100xf32>>)
+! CHECK:       acc.loop reduction(@reduction_mul_section_ext100_ref_100xf32 -> %[[RED_B]] : !fir.ref<!fir.array<100xf32>>)
 
 subroutine acc_reduction_min_int(a, b)
   integer :: a(100)
@@ -769,7 +872,7 @@ subroutine acc_reduction_min_int_array_1d(a, b)
 ! HLFIR: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]]
 ! FIR:   %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref<!fir.array<100xi32>>) bounds(%2) -> !fir.ref<!fir.array<100xi32>> {name = "b"}
 ! HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "b"}
-! CHECK: acc.loop reduction(@reduction_min_ref_100xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100xi32>>)
+! CHECK: acc.loop reduction(@reduction_min_section_ext100_ref_100xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100xi32>>)
 
 subroutine acc_reduction_min_float(a, b)
   real :: a(100), b
@@ -805,7 +908,7 @@ subroutine acc_reduction_min_float_array2d(a, b)
 ! HLFIR: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]]
 ! FIR:   %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref<!fir.array<100x10xf32>>) bounds(%3, %5) -> !fir.ref<!fir.array<100x10xf32>> {name = "b"}
 ! HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref<!fir.array<100x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10xf32>> {name = "b"}
-! CHECK: acc.loop reduction(@reduction_min_ref_100x10xf32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10xf32>>)
+! CHECK: acc.loop reduction(@reduction_min_section_ext100xext10_ref_100x10xf32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10xf32>>)
 ! CHECK: attributes {collapse = 2 : i64}
 
 subroutine acc_reduction_max_int(a, b)
@@ -842,7 +945,7 @@ subroutine acc_reduction_max_int_array2d(a, b)
 ! HLFIR: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]]
 ! FIR:   %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref<!fir.array<100x10xi32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10xi32>> {name = "b"}
 ! HLFIR: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref<!fir.array<100x10xi32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<100x10xi32>> {name = "b"}
-! CHECK: acc.loop reduction(@reduction_max_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10xi32>>)
+! CHECK: acc.loop reduction(@reduction_max_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100x10xi32>>)
 
 subroutine acc_reduction_max_float(a, b)
   real :: a(100), b
@@ -876,7 +979,7 @@ subroutine acc_reduction_max_float_array1d(a, b)
 ! HLFIR:       %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]]
 ! FIR:         %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[ARG1]] : !fir.ref<!fir.array<100xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xf32>> {name = "b"}
 ! HLFIR:       %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref<!fir.array<100xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xf32>> {name = "b"}
-! CHECK:       acc.loop reduction(@reduction_max_ref_100xf32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100xf32>>) {
+! CHECK:       acc.loop reduction(@reduction_max_section_ext100_ref_100xf32 -> %[[RED_ARG1]] : !fir.ref<!fir.array<100xf32>>) {
 
 subroutine acc_reduction_iand()
   integer :: i
@@ -1017,7 +1120,7 @@ subroutine acc_reduction_add_static_slice(a)
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! FIR:   %[[RED:.*]] = acc.reduction varPtr(%[[ARG0]] : !fir.ref<!fir.array<100xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<100xi32>> {name = "a(11:20)"}
 ! HLFIR: %[[RED:.*]] = acc.reduction varPtr(%[[DECLARG0]]#1 : !fir.ref<!fir.array<100xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<100xi32>> {name = "a(11:20)"}
-! CHECK: acc.parallel reduction(@reduction_add_ref_100xi32 -> %[[RED]] : !fir.ref<!fir.array<100xi32>>)
+! CHECK: acc.parallel reduction(@reduction_add_section_lb10.ub19_ref_100xi32 -> %[[RED]] : !fir.ref<!fir.array<100xi32>>)
 
 subroutine acc_reduction_add_dynamic_extent_add(a)
   integer :: a(:)
@@ -1029,7 +1132,7 @@ subroutine acc_reduction_add_dynamic_extent_add(a)
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"})
 ! HLFIR: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! HLFIR: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {name = "a"}
-! HLFIR: acc.parallel reduction(@"reduction_add_ref_?xi32" -> %[[RED:.*]] : !fir.ref<!fir.array<?xi32>>)
+! HLFIR: acc.parallel reduction(@reduction_add_box_Uxi32 -> %[[RED:.*]] : !fir.ref<!fir.array<?xi32>>)
 
 subroutine acc_reduction_add_dynamic_extent_max(a)
   real :: a(:)
@@ -1041,4 +1144,48 @@ subroutine acc_reduction_add_dynamic_extent_max(a)
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"})
 ! HLFIR: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! HLFIR: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xf32>> {name = "a"}
-! HLFIR: acc.parallel reduction(@"reduction_max_ref_?xf32" -> %[[RED]] : !fir.ref<!fir.array<?xf32>>) {
+! HLFIR: acc.parallel reduction(@reduction_max_box_Uxf32 -> %[[RED]] : !fir.ref<!fir.array<?xf32>>) {
+
+subroutine acc_reduction_add_dynamic_extent_add_with_section(a)
+  integer :: a(:)
+  !$acc parallel reduction(+:a(2:4))
+  !$acc end parallel
+end subroutine
+
+! CHECK-LABEL: func.func @_QPacc_reduction_add_dynamic_extent_add_with_section(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"})
+! HLFIR: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! HLFIR: %[[BOUND:.*]] = acc.bounds lowerbound(%c1{{.*}} : index) upperbound(%c3{{.*}} : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true}
+! HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! HLFIR: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<?xi32>> {name = "a(2:4)"}
+! HLFIR: acc.parallel reduction(@reduction_add_section_lb1.ub3_box_Uxi32 -> %[[RED]] : !fir.ref<!fir.array<?xi32>>)
+
+subroutine acc_reduction_add_allocatable(a)
+  real, allocatable :: a(:)
+  !$acc parallel reduction(max:a)
+  !$acc end parallel
+end subroutine
+
+! CHECK-LABEL: func.func @_QPacc_reduction_add_allocatable(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "a"})
+! HLFIR: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! HLFIR: %[[BOX:.*]] = fir.load %[[DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! HLFIR: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}}#0 : index) {strideInBytes = true}
+! HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+! HLFIR: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>)   bounds(%6) -> !fir.heap<!fir.array<?xf32>> {name = "a"}
+! HLFIR: acc.parallel reduction(@reduction_max_box_heap_Uxf32 -> %[[RED]] : !fir.heap<!fir.array<?xf32>>)
+
+subroutine acc_reduction_add_pointer_array(a)
+  real, pointer :: a(:)
+  !$acc parallel reduction(max:a)
+  !$acc end parallel
+end subroutine
+
+! CHECK-LABEL: func.func @_QPacc_reduction_add_pointer_array(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a"})
+! HLFIR: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! HLFIR: %[[BOX:.*]] = fir.load %[[DECL]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! HLFIR: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}}#0 : index) {strideInBytes = true}
+! HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>>
+! HLFIR: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.ptr<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.ptr<!fir.array<?xf32>> {name = "a"}
+! HLFIR: acc.parallel reduction(@reduction_max_box_ptr_Uxf32 -> %[[RED]] : !fir.ptr<!fir.array<?xf32>>)
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-update.f90 b/flang/test/Lower/OpenMP/FIR/atomic-update.f90
index 0c3e25cb483172..d0185d2f3b14df 100644
--- a/flang/test/Lower/OpenMP/FIR/atomic-update.f90
+++ b/flang/test/Lower/OpenMP/FIR/atomic-update.f90
@@ -69,8 +69,8 @@ program OmpAtomicUpdate
 !CHECK:  ^bb0(%[[ARG:.*]]: i32):
 !CHECK:    %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
 !CHECK:    %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref<i32>
-!CHECK:    %{{.*}} = arith.cmpi sgt, %[[LOADED_X]], %[[ARG]] : i32
-!CHECK:    %{{.*}} = arith.select %{{.*}}, %[[LOADED_X]], %[[ARG]] : i32
+!CHECK:    %{{.*}} = arith.cmpi sgt, %[[ARG]], %[[LOADED_X]] : i32
+!CHECK:    %{{.*}} = arith.select %{{.*}}, %[[ARG]], %[[LOADED_X]] : i32
 !CHECK:    %{{.*}} = arith.cmpi sgt, %{{.*}}, %[[LOADED_Z]] : i32
 !CHECK:    %[[RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %[[LOADED_Z]] : i32
 !CHECK:    omp.yield(%[[RESULT]] : i32)
@@ -84,7 +84,7 @@ program OmpAtomicUpdate
     !$omp atomic relaxed update hint(omp_sync_hint_uncontended)
         x = x - 1
     !$omp atomic update relaxed 
-        y = max(x, y, z)
+        y = max(y, x, z)
     !$omp atomic relaxed hint(omp_sync_hint_contended)
         z = z + x
 
diff --git a/flang/test/Lower/OpenMP/FIR/copyin.f90 b/flang/test/Lower/OpenMP/FIR/copyin.f90
index ddfa0ea0914628..2b3f382240958c 100644
--- a/flang/test/Lower/OpenMP/FIR/copyin.f90
+++ b/flang/test/Lower/OpenMP/FIR/copyin.f90
@@ -85,8 +85,7 @@ subroutine copyin_scalar_array()
 ! CHECK:           }
 ! CHECK:           fir.array_merge_store %[[VAL_16]], %[[VAL_35:.*]] to %[[VAL_14]] : !fir.array<10x!fir.char<1,5>>, !fir.array<10x!fir.char<1,5>>, !fir.ref<!fir.array<10x!fir.char<1,5>>>
 ! CHECK:           omp.barrier
-! CHECK:           %[[VAL_36:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:           %[[VAL_37:.*]] = fir.emboxchar %[[VAL_36]], %[[VAL_1]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_37:.*]] = fir.emboxchar %[[VAL_7]], %[[VAL_1]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 ! CHECK:           %[[VAL_38:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.array<10x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:           %[[VAL_39:.*]] = fir.emboxchar %[[VAL_38]], %[[VAL_4]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 ! CHECK:           fir.call @_QPsub2(%[[VAL_37]], %[[VAL_39]]) {{.*}}: (!fir.boxchar<1>, !fir.boxchar<1>) -> ()
diff --git a/flang/test/Lower/OpenMP/FIR/private-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
index 767458e3effbf0..6e98b917cf87fc 100644
--- a/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
+++ b/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
@@ -34,8 +34,7 @@ subroutine private_common
 !CHECK: %[[val_11:.*]] = fir.coordinate_of %[[val_10]], %[[val_c49]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
 !CHECK: %[[val_12:.*]] = fir.convert %[[val_11]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
 !CHECK: %[[val_c5_0:.*]] = arith.constant 5 : index
-!CHECK: %[[val_13:.*]] = fir.convert %[[val_9]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_14:.*]] = fir.emboxchar %[[val_13]], %[[val_c5]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+!CHECK: %[[val_14:.*]] = fir.emboxchar %[[val_9]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 !CHECK: %[[val_15:.*]] = fir.convert %[[val_12]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
 !CHECK: %[[val_16:.*]] = fir.emboxchar %[[val_15]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 !CHECK: fir.call @_QPsub1(%[[val_3]], %[[val_6]], %[[val_14]], %[[val_16]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
@@ -44,15 +43,13 @@ subroutine private_common
 !CHECK: %[[val_22:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "b", pinned, uniq_name = "_QFprivate_clause_commonblockEb"}
 !CHECK: %[[val_23:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblockEc"}
 !CHECK: %[[val_24:.*]] = fir.alloca !fir.array<5x!fir.char<1,5>> {bindc_name = "d", pinned, uniq_name = "_QFprivate_clause_commonblockEd"}
-!CHECK: %[[val_25:.*]] = fir.convert %[[val_23]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_26:.*]] = fir.emboxchar %[[val_25]], %[[val_c5]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+!CHECK: %[[val_26:.*]] = fir.emboxchar %[[val_23]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 !CHECK: %[[val_27:.*]] = fir.convert %[[val_24]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
 !CHECK: %[[val_28:.*]] = fir.emboxchar %[[val_27]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 !CHECK: fir.call @_QPsub2(%[[val_21]], %[[val_22]], %[[val_26]], %[[val_28]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
 !CHECK: omp.terminator
 !CHECK: }
-!CHECK: %[[val_17:.*]] = fir.convert %[[val_9]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_18:.*]] = fir.emboxchar %[[val_17]], %[[val_c5]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+!CHECK: %[[val_18:.*]] = fir.emboxchar %[[val_9]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 !CHECK: %[[val_19:.*]] = fir.convert %[[val_12]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
 !CHECK: %[[val_20:.*]] = fir.emboxchar %[[val_19]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 !CHECK: fir.call @_QPsub3(%[[val_3]], %[[val_6]], %[[val_18]], %[[val_20]]) fastmath<contract> : {{.*}}
diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90
new file mode 100644
index 00000000000000..831c7021d6b2ed
--- /dev/null
+++ b/flang/test/Lower/OpenMP/array-bounds.f90
@@ -0,0 +1,119 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes HOST
+
+
+!HOST-LABEL:  func.func @_QPread_write_section() {
+!HOST:  %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFread_write_sectionEi"}
+!HOST:  %[[READ:.*]] = fir.address_of(@_QFread_write_sectionEsp_read) : !fir.ref<!fir.array<10xi32>>
+!HOST:  %[[C10:.*]] = arith.constant 10 : index
+!HOST:  %[[READ_SHAPE:.*]] = fir.shape %[[C10]] : (index) -> !fir.shape<1>
+!HOST:  %[[READ_DECL:.*]]:2 = hlfir.declare %[[READ]](%[[READ_SHAPE]]) {uniq_name = "_QFread_write_sectionEsp_read"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+!HOST:  %[[WRITE:.*]] = fir.address_of(@_QFread_write_sectionEsp_write) : !fir.ref<!fir.array<10xi32>>
+!HOST:  %[[C10:.*]] = arith.constant 10 : index
+!HOST:  %[[WRITE_SHAPE:.*]] = fir.shape %[[C10]] : (index) -> !fir.shape<1>
+!HOST:  %[[WRITE_DECL:.*]]:2 = hlfir.declare %[[WRITE]](%[[WRITE_SHAPE]]) {uniq_name = "_QFread_write_sectionEsp_write"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+
+!HOST:  %[[C1:.*]] = arith.constant 1 : index
+!HOST:  %[[C2:.*]] = arith.constant 1 : index
+!HOST:  %[[C3:.*]] = arith.constant 4 : index
+!HOST:  %[[BOUNDS0:.*]] = omp.bounds   lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
+!HOST:  %[[MAP0:.*]] = omp.map_info var_ptr(%[[READ_DECL]]#1 : !fir.ref<!fir.array<10xi32>>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref<!fir.array<10xi32>> {name = "sp_read(2:5)"}
+!HOST:  %[[C4:.*]] = arith.constant 1 : index
+!HOST:  %[[C5:.*]] = arith.constant 1 : index
+!HOST:  %[[C6:.*]] = arith.constant 4 : index
+!HOST:  %[[BOUNDS1:.*]] = omp.bounds   lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index)
+!HOST:  %[[MAP1:.*]] = omp.map_info var_ptr(%[[WRITE_DECL]]#1 : !fir.ref<!fir.array<10xi32>>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref<!fir.array<10xi32>> {name = "sp_write(2:5)"}
+!HOST:  omp.target   map_entries(%[[MAP0]], %[[MAP1]] : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) {
+
+subroutine read_write_section()
+    integer :: sp_read(10) = (/1,2,3,4,5,6,7,8,9,10/)
+    integer :: sp_write(10) = (/0,0,0,0,0,0,0,0,0,0/)
+
+!$omp target map(tofrom:sp_read(2:5)) map(tofrom:sp_write(2:5))
+    do i = 2, 5
+        sp_write(i) = sp_read(i)
+    end do
+!$omp end target
+end subroutine read_write_section
+
+
+module assumed_array_routines
+    contains
+
+!HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array(
+!HOST-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) {
+!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!HOST: %[[C0:.*]] = arith.constant 1 : index
+!HOST: %[[C1:.*]] = arith.constant 0 : index
+!HOST: %[[C2:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#1, %[[C1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+!HOST: %[[C3:.*]] = arith.constant 1 : index
+!HOST: %[[C4:.*]] = arith.constant 4 : index
+!HOST: %[[BOUNDS:.*]] = omp.bounds   lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) stride(%[[C2]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true}
+!HOST: %[[ADDROF:.*]] = fir.box_addr %[[ARG0_DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+!HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ADDROF]] : !fir.ref<!fir.array<?xi32>>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
+!HOST: omp.target   map_entries(%[[MAP]] : !fir.ref<!fir.array<?xi32>>) {
+    subroutine assumed_shape_array(arr_read_write)
+            integer, intent(inout) :: arr_read_write(:)
+
+        !$omp target map(tofrom:arr_read_write(2:5))
+            do i = 2, 5
+                arr_read_write(i) = i
+            end do
+        !$omp end target
+        end subroutine assumed_shape_array
+
+
+!HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array(
+!HOST-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) {
+!HOST: %[[UNDEF:.*]] = fir.undefined index
+!HOST: %[[ARG0_SHAPE:.*]] = fir.shape %[[UNDEF]] : (index) -> !fir.shape<1>
+!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+!HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"}
+!HOST: %[[C0:.*]] = arith.constant 1 : index
+!HOST: %[[C1:.*]] = arith.constant 1 : index
+!HOST: %[[C2:.*]] = arith.constant 4 : index
+!HOST: %[[BOUNDS:.*]]  = omp.bounds   lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index)
+!HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref<!fir.array<?xi32>>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
+!HOST: omp.target   map_entries(%[[MAP]] : !fir.ref<!fir.array<?xi32>>) {
+        subroutine assumed_size_array(arr_read_write)
+            integer, intent(inout) :: arr_read_write(*)
+
+        !$omp target map(tofrom:arr_read_write(2:5))
+            do i = 2, 5
+                arr_read_write(i) = i
+            end do
+        !$omp end target
+        end subroutine assumed_size_array
+    end module assumed_array_routines
+
+
+!HOST-LABEL:func.func @_QPcall_assumed_shape_and_size_array() {
+!HOST: %[[C20:.*]] = arith.constant 20 : index
+!HOST: %[[READ_WRITE:.*]] = fir.alloca !fir.array<20xi32> {bindc_name = "arr_read_write", uniq_name = "_QFcall_assumed_shape_and_size_arrayEarr_read_write"}
+!HOST: %[[SHAPE_READ_WRITE:.*]] = fir.shape %[[C20]] : (index) -> !fir.shape<1>
+!HOST: %[[READ_WRITE_DECL:.*]]:2 = hlfir.declare %[[READ_WRITE]](%[[SHAPE_READ_WRITE]]) {uniq_name = "_QFcall_assumed_shape_and_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<20xi32>>, !fir.ref<!fir.array<20xi32>>)
+!HOST: %[[LB:.*]] = arith.constant 1 : index
+!HOST: %[[UB:.*]] = arith.constant 10 : index
+!HOST: %[[STEP:.*]] = arith.constant 1 : index
+!HOST: %[[SHAPE_VAL10:.*]] = arith.constant 10 : index
+!HOST: %[[SHAPE:.*]] = fir.shape %[[SHAPE_VAL10]] : (index) -> !fir.shape<1>
+!HOST: %[[READ_WRITE_DESIGNATE:.*]] = hlfir.designate %[[READ_WRITE_DECL]]#0 (%[[LB]]:%[[UB]]:%[[STEP]])  shape %[[SHAPE]] : (!fir.ref<!fir.array<20xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+!HOST: %[[READ_WRITE_EMBOX:.*]] = fir.embox %[[READ_WRITE_DESIGNATE]](%[[SHAPE]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
+!HOST: %[[ARG0:.*]] = fir.convert %[[READ_WRITE_EMBOX]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<!fir.array<?xi32>>
+!HOST: fir.call @_QMassumed_array_routinesPassumed_shape_array(%[[ARG0]]) fastmath<contract> : (!fir.box<!fir.array<?xi32>>) -> ()
+!HOST: %[[LB:.*]] = arith.constant 10 : index
+!HOST: %[[UB:.*]] = arith.constant 20 : index
+!HOST: %[[STEP:.*]] = arith.constant 1 : index
+!HOST: %[[SHAPE_VAL11:.*]] = arith.constant 11 : index
+!HOST: %[[SHAPE:.*]] = fir.shape %[[SHAPE_VAL11]] : (index) -> !fir.shape<1>
+!HOST: %[[READ_WRITE_EMBOX:.*]] = hlfir.designate %[[READ_WRITE_DECL]]#0 (%[[LB]]:%[[UB]]:%[[STEP]])  shape %[[SHAPE]] : (!fir.ref<!fir.array<20xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<11xi32>>
+!HOST: %[[ARG1:.*]] = fir.convert %[[READ_WRITE_EMBOX]] : (!fir.ref<!fir.array<11xi32>>) -> !fir.ref<!fir.array<?xi32>>
+!HOST: fir.call @_QMassumed_array_routinesPassumed_size_array(%[[ARG1]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
+!HOST: return
+!HOST:}
+
+subroutine call_assumed_shape_and_size_array
+    use assumed_array_routines
+    integer :: arr_read_write(20)
+    call assumed_shape_array(arr_read_write(1:10))
+    call assumed_size_array(arr_read_write(10:20))
+end subroutine call_assumed_shape_and_size_array
diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90
new file mode 100644
index 00000000000000..032009add31535
--- /dev/null
+++ b/flang/test/Lower/OpenMP/if-clause.f90
@@ -0,0 +1,505 @@
+! This test checks lowering of OpenMP IF clauses.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program main
+  integer :: i
+
+  ! TODO When they are supported, add tests for:
+  ! - DISTRIBUTE PARALLEL DO
+  ! - DISTRIBUTE PARALLEL DO SIMD
+  ! - DISTRIBUTE SIMD
+  ! - PARALLEL SECTIONS
+  ! - PARALLEL WORKSHARE
+  ! - TARGET PARALLEL
+  ! - TARGET TEAMS DISTRIBUTE
+  ! - TARGET TEAMS DISTRIBUTE PARALLEL DO
+  ! - TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD
+  ! - TARGET TEAMS DISTRIBUTE SIMD
+  ! - TARGET UPDATE
+  ! - TASKLOOP
+  ! - TASKLOOP SIMD
+  ! - TEAMS DISTRIBUTE
+  ! - TEAMS DISTRIBUTE PARALLEL DO
+  ! - TEAMS DISTRIBUTE PARALLEL DO SIMD
+  ! - TEAMS DISTRIBUTE SIMD
+
+  ! ----------------------------------------------------------------------------
+  ! DO SIMD
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp do simd
+  do i = 1, 10
+  end do
+  !$omp end do simd
+
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp do simd if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end do simd
+
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp do simd if(simd: .true.)
+  do i = 1, 10
+  end do
+  !$omp end do simd
+
+  ! ----------------------------------------------------------------------------
+  ! PARALLEL
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp parallel
+  i = 10
+  !$omp end parallel
+
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel if(.true.)
+  i = 10
+  !$omp end parallel
+
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel if(parallel: .true.)
+  i = 10
+  !$omp end parallel
+
+  ! ----------------------------------------------------------------------------
+  ! PARALLEL DO
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp parallel do
+  do i = 1, 10
+  end do
+  !$omp end parallel do
+
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel do if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end parallel do
+
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel do if(parallel: .true.)
+  do i = 1, 10
+  end do
+  !$omp end parallel do
+
+  ! ----------------------------------------------------------------------------
+  ! PARALLEL DO SIMD
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp parallel do simd
+  do i = 1, 10
+  end do
+  !$omp end parallel do simd
+
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel do simd if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end parallel do simd
+  
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel do simd if(parallel: .true.) if(simd: .false.)
+  do i = 1, 10
+  end do
+  !$omp end parallel do simd
+  
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp parallel do simd if(parallel: .true.)
+  do i = 1, 10
+  end do
+  !$omp end parallel do simd
+  
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp parallel do simd if(simd: .true.)
+  do i = 1, 10
+  end do
+  !$omp end parallel do simd
+
+  ! ----------------------------------------------------------------------------
+  ! SIMD
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp simd
+  do i = 1, 10
+  end do
+  !$omp end simd
+
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp simd if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end simd
+
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp simd if(simd: .true.)
+  do i = 1, 10
+  end do
+  !$omp end simd
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target
+  !$omp end target
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  !$omp target if(.true.)
+  !$omp end target
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  !$omp target if(target: .true.)
+  !$omp end target
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET DATA
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target_data
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target data map(tofrom: i)
+  !$omp end target data
+
+  ! CHECK:      omp.target_data
+  ! CHECK-SAME: if({{.*}})
+  !$omp target data map(tofrom: i) if(.true.)
+  !$omp end target data
+
+  ! CHECK:      omp.target_data
+  ! CHECK-SAME: if({{.*}})
+  !$omp target data map(tofrom: i) if(target data: .true.)
+  !$omp end target data
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET ENTER DATA
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target_enter_data
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: map
+  !$omp target enter data map(to: i)
+
+  ! CHECK:      omp.target_enter_data
+  ! CHECK-SAME: if({{.*}})
+  !$omp target enter data map(to: i) if(.true.)
+
+  ! CHECK:      omp.target_enter_data
+  ! CHECK-SAME: if({{.*}})
+  !$omp target enter data map(to: i) if(target enter data: .true.)
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET EXIT DATA
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target_exit_data
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: map
+  !$omp target exit data map(from: i)
+
+  ! CHECK:      omp.target_exit_data
+  ! CHECK-SAME: if({{.*}})
+  !$omp target exit data map(from: i) if(.true.)
+
+  ! CHECK:      omp.target_exit_data
+  ! CHECK-SAME: if({{.*}})
+  !$omp target exit data map(from: i) if(target exit data: .true.)
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET PARALLEL DO
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target parallel do
+  do i = 1, 10
+  end do
+  !$omp end target parallel do
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp target parallel do if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp target parallel do if(target: .true.) if(parallel: .false.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target parallel do if(target: .true.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do
+
+  
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  !$omp target parallel do if(parallel: .true.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET PARALLEL DO SIMD
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target parallel do simd
+  do i = 1, 10
+  end do
+  !$omp end target parallel do simd
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp target parallel do simd if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do simd
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp target parallel do simd if(target: .true.) if(parallel: .false.) &
+  !$omp&                        if(simd: .true.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do simd
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.parallel
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target parallel do simd if(target: .true.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do simd
+
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.parallel
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp target parallel do simd if(parallel: .true.) if(simd: .false.)
+  do i = 1, 10
+  end do
+  !$omp end target parallel do simd
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET SIMD
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target simd
+  do i = 1, 10
+  end do
+  !$omp end target simd
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp target simd if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end target simd
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp target simd if(target: .true.) if(simd: .false.)
+  do i = 1, 10
+  end do
+  !$omp end target simd
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.simdloop
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target simd if(target: .true.)
+  do i = 1, 10
+  end do
+  !$omp end target simd
+
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.simdloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp target simd if(simd: .true.)
+  do i = 1, 10
+  end do
+  !$omp end target simd
+
+  ! ----------------------------------------------------------------------------
+  ! TARGET TEAMS
+  ! ----------------------------------------------------------------------------
+
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.teams
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target teams
+  i = 1
+  !$omp end target teams
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.teams
+  ! CHECK-SAME: if({{.*}})
+  !$omp target teams if(.true.)
+  i = 1
+  !$omp end target teams
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.teams
+  ! CHECK-SAME: if({{.*}})
+  !$omp target teams if(target: .true.) if(teams: .false.)
+  i = 1
+  !$omp end target teams
+
+  ! CHECK:      omp.target
+  ! CHECK-SAME: if({{.*}})
+  ! CHECK:      omp.teams
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp target teams if(target: .true.)
+  i = 1
+  !$omp end target teams
+
+  ! CHECK:      omp.target
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  ! CHECK:      omp.teams
+  ! CHECK-SAME: if({{.*}})
+  !$omp target teams if(teams: .true.)
+  i = 1
+  !$omp end target teams
+
+  ! ----------------------------------------------------------------------------
+  ! TASK
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.task
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp task
+  !$omp end task
+
+  ! CHECK:      omp.task
+  ! CHECK-SAME: if({{.*}})
+  !$omp task if(.true.)
+  !$omp end task
+
+  ! CHECK:      omp.task
+  ! CHECK-SAME: if({{.*}})
+  !$omp task if(task: .true.)
+  !$omp end task
+
+  ! ----------------------------------------------------------------------------
+  ! TEAMS
+  ! ----------------------------------------------------------------------------
+  ! CHECK:      omp.teams
+  ! CHECK-NOT:  if({{.*}})
+  ! CHECK-SAME: {
+  !$omp teams
+  i = 1
+  !$omp end teams
+
+  ! CHECK:      omp.teams
+  ! CHECK-SAME: if({{.*}})
+  !$omp teams if(.true.)
+  i = 1
+  !$omp end teams
+
+  ! CHECK:      omp.teams
+  ! CHECK-SAME: if({{.*}})
+  !$omp teams if(teams: .true.)
+  i = 1
+  !$omp end teams
+end program main
diff --git a/flang/test/Lower/OpenMP/is-device.f90 b/flang/test/Lower/OpenMP/is-device.f90
new file mode 100644
index 00000000000000..9df02d66d06493
--- /dev/null
+++ b/flang/test/Lower/OpenMP/is-device.f90
@@ -0,0 +1,14 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix=HOST
+!RUN: %flang_fc1 -emit-hlfir -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE-FLAG-ONLY
+!RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir -o - %s | FileCheck %s --check-prefix=DEVICE
+!RUN: bbc -fopenmp -emit-hlfir -o - %s | FileCheck %s --check-prefix=HOST
+!RUN: bbc -fopenmp-is-target-device -emit-hlfir -o - %s | FileCheck %s --check-prefix=DEVICE-FLAG-ONLY
+
+!DEVICE: module attributes {{{.*}}, omp.is_target_device = true{{.*}}}
+!HOST: module attributes {{{.*}}, omp.is_target_device = false{{.*}}}
+!DEVICE-FLAG-ONLY: module attributes {{{.*}}"
+!DEVICE-FLAG-ONLY-NOT: , omp.is_target_device = {{.*}}
+!DEVICE-FLAG-ONLY-SAME: }
+subroutine omp_subroutine()
+end subroutine omp_subroutine
diff --git a/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90 b/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90
new file mode 100644
index 00000000000000..20538ff34871f5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90
@@ -0,0 +1,13 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes=HOST,ALL
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=ALL
+
+PROGRAM main
+    ! HOST-DAG: %[[I_REF:.*]] = fir.alloca f32 {bindc_name = "i", uniq_name = "_QFEi"}
+    ! HOST-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFEi"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+    REAL :: I
+    ! ALL-DAG: fir.global internal @_QFEi {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32 {
+    ! ALL-DAG: %[[UNDEF:.*]] = fir.undefined f32
+    ! ALL-DAG: fir.has_value %[[UNDEF]] : f32
+    ! ALL-DAG: }
+    !$omp declare target(I)
+END
diff --git a/flang/test/Lower/OpenMP/omp-is-gpu.f90 b/flang/test/Lower/OpenMP/omp-is-gpu.f90
new file mode 100644
index 00000000000000..12d0e4e869fba5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/omp-is-gpu.f90
@@ -0,0 +1,14 @@
+!REQUIRES: amdgpu-registered-target
+
+!RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
+!RUN: bbc -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -emit-hlfir -o - %s | FileCheck %s
+
+!RUN: not %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp %s -o - 2>&1 | FileCheck %s --check-prefix=FLANG-ERROR
+!RUN: not bbc -fopenmp -fopenmp-is-gpu -emit-hlfir %s -o - 2>&1 | FileCheck %s --check-prefix=BBC-ERROR
+
+!CHECK: module attributes {{{.*}}omp.is_gpu = true
+subroutine omp_subroutine()
+end subroutine omp_subroutine
+
+!FLANG-ERROR: error: OpenMP AMDGPU/NVPTX is only prepared to deal with device code.
+!BBC-ERROR: FATAL: -fopenmp-is-gpu can only be set if -fopenmp-is-target-device is also set
diff --git a/flang/test/Lower/OpenMP/ordered-threads.f90 b/flang/test/Lower/OpenMP/ordered-threads.f90
new file mode 100644
index 00000000000000..a3f99129eba7ee
--- /dev/null
+++ b/flang/test/Lower/OpenMP/ordered-threads.f90
@@ -0,0 +1,25 @@
+! This test checks lowering of OpenMP ordered directive with threads Clause.
+! Without clause in ordered direcitve, it behaves as if threads clause is
+! specified.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+subroutine ordered
+        integer :: i
+        integer :: a(20)
+
+!CHECK: omp.ordered_region  {
+!$OMP ORDERED
+        a(i) = a(i-1) + 1
+!CHECK:   omp.terminator
+!CHECK-NEXT: }
+!$OMP END ORDERED
+
+!CHECK: omp.ordered_region  {
+!$OMP ORDERED THREADS
+        a(i) = a(i-1) + 1
+!CHECK:   omp.terminator
+!CHECK-NEXT: }
+!$OMP END ORDERED
+
+end
diff --git a/flang/test/Lower/OpenMP/requires-common.f90 b/flang/test/Lower/OpenMP/requires-common.f90
new file mode 100644
index 00000000000000..b3801a834014f7
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-common.f90
@@ -0,0 +1,19 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck %s
+
+! This test checks the lowering of requires into MLIR
+
+!CHECK:      module attributes {
+!CHECK-SAME: omp.requires = #omp<clause_requires unified_shared_memory>
+block data init
+  !$omp requires unified_shared_memory
+  integer :: x
+  common /block/ x
+  data x / 10 /
+end
+
+subroutine f
+  !$omp declare target
+end subroutine f
diff --git a/flang/test/Lower/OpenMP/requires-notarget.f90 b/flang/test/Lower/OpenMP/requires-notarget.f90
new file mode 100644
index 00000000000000..6f555a1782908a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-notarget.f90
@@ -0,0 +1,14 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck %s
+
+! This test checks that requires lowering into MLIR skips creating the
+! omp.requires attribute with target-related clauses if there are no device
+! functions in the compilation unit
+
+!CHECK:      module attributes {
+!CHECK-NOT:  omp.requires
+program requires
+  !$omp requires unified_shared_memory reverse_offload atomic_default_mem_order(seq_cst)
+end program requires
diff --git a/flang/test/Lower/OpenMP/requires.f90 b/flang/test/Lower/OpenMP/requires.f90
new file mode 100644
index 00000000000000..cec836f11a3a03
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires.f90
@@ -0,0 +1,14 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck %s
+
+! This test checks the lowering of requires into MLIR
+
+!CHECK:      module attributes {
+!CHECK-SAME: omp.requires = #omp<clause_requires reverse_offload|unified_shared_memory>
+program requires
+  !$omp requires unified_shared_memory reverse_offload atomic_default_mem_order(seq_cst)
+  !$omp target
+  !$omp end target
+end program requires
diff --git a/flang/test/Lower/OpenMP/rtl-flags.f90 b/flang/test/Lower/OpenMP/rtl-flags.f90
new file mode 100644
index 00000000000000..b38a6af0d7e3eb
--- /dev/null
+++ b/flang/test/Lower/OpenMP/rtl-flags.f90
@@ -0,0 +1,39 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-HOST-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device -fopenmp-version=45  %s -o - | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR-VERSION
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-HOST-FIR-VERSION
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-target-debug -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DBG-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-target-debug=111 -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DBG-EQ-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-assume-teams-oversubscription -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=TEAMS-OSUB-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-assume-threads-oversubscription -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=THREAD-OSUB-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-assume-no-thread-state -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=THREAD-STATE-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-assume-no-nested-parallelism -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=NEST-PAR-DEVICE-FIR
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-target-debug -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism -fopenmp-assume-threads-oversubscription -fopenmp-assume-no-thread-state -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=ALL-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR-VERSION
+!RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s --check-prefix=DEFAULT-HOST-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=DEFAULT-HOST-FIR-VERSION
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-target-debug=111 -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=DBG-EQ-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-assume-teams-oversubscription -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=TEAMS-OSUB-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-assume-threads-oversubscription -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=THREAD-OSUB-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-assume-no-thread-state -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=THREAD-STATE-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-assume-no-nested-parallelism -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=NEST-PAR-DEVICE-FIR
+!RUN: bbc -emit-hlfir -fopenmp -fopenmp-target-debug=1 -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism -fopenmp-assume-threads-oversubscription -fopenmp-assume-no-thread-state -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=ALL-DEVICE-FIR
+
+!DEFAULT-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<openmp_device_version = 11>
+!DEFAULT-DEVICE-FIR-SAME: omp.is_target_device = true
+!DEFAULT-DEVICE-FIR-VERSION: module attributes {{{.*}}omp.flags = #omp.flags<openmp_device_version = 45>
+!DEFAULT-DEVICE-FIR-VERSION-SAME: omp.is_target_device = true
+!DEFAULT-DEVICE-FIR-VERSION-SAME: omp.version = #omp.version<version = 45>
+!DEFAULT-HOST-FIR: module attributes {{{.*}}omp.is_target_device = false{{.*}}
+!DEFAULT-HOST-FIR-VERSION: module attributes {{{.*}}omp.is_target_device = false
+!DEFAULT-HOST-FIR-VERSION-SAME: omp.version = #omp.version<version = 45>
+!DBG-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 1, openmp_device_version = 11>
+!DBG-EQ-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 111, openmp_device_version = 11>
+!TEAMS-OSUB-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_teams_oversubscription = true, openmp_device_version = 11>
+!THREAD-OSUB-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_threads_oversubscription = true, openmp_device_version = 11>
+!THREAD-STATE-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_no_thread_state = true, openmp_device_version = 11>
+!NEST-PAR-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_no_nested_parallelism = true, openmp_device_version = 11>
+!ALL-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 1, assume_teams_oversubscription = true, assume_threads_oversubscription = true, assume_no_thread_state = true, assume_no_nested_parallelism = true, openmp_device_version = 11>
+subroutine omp_subroutine()
+end subroutine omp_subroutine
diff --git a/flang/test/Lower/OpenMP/target_cpu_features.f90 b/flang/test/Lower/OpenMP/target_cpu_features.f90
new file mode 100644
index 00000000000000..46fb14efad5c03
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target_cpu_features.f90
@@ -0,0 +1,19 @@
+!REQUIRES: amdgpu-registered-target
+!RUN: %flang_fc1 -emit-hlfir -triple amdgcn-amd-amdhsa -target-cpu gfx908 -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
+
+!===============================================================================
+! Target_Enter Simple
+!===============================================================================
+
+!CHECK: omp.target = #omp.target<target_cpu = "gfx908",
+!CHECK-SAME: target_features = "+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,
+!CHECK-SAME: +dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,
+!CHECK-SAME: +gfx8-insts,+gfx9-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,
+!CHECK-SAME: +wavefrontsize64">
+!CHECK-LABEL: func.func @_QPomp_target_simple()
+subroutine omp_target_simple
+  ! Directive needed to prevent subroutine from being filtered out when
+  ! compiling for the device.
+  !$omp declare target
+end subroutine omp_target_simple
+
diff --git a/flang/test/Lower/OpenMP/taskgroup.f90 b/flang/test/Lower/OpenMP/taskgroup.f90
new file mode 100644
index 00000000000000..85dc253b59d659
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskgroup.f90
@@ -0,0 +1,20 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: @_QPomp_taskgroup
+subroutine omp_taskgroup
+use omp_lib
+integer :: allocated_x
+!CHECK: %[[ALLOC_X_REF:.*]] = fir.alloca i32 {bindc_name = "allocated_x", uniq_name = "_QFomp_taskgroupEallocated_x"}
+!CHECK-NEXT: %[[ALLOC_X_DECL:.*]]:2 = hlfir.declare %[[ALLOC_X_REF]] {uniq_name = "_QFomp_taskgroupEallocated_x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[C1:.*]] = arith.constant 1 : i32
+
+!CHECK: omp.taskgroup  allocate(%[[C1]] : i32 -> %[[ALLOC_X_DECL]]#1 : !fir.ref<i32>)
+!$omp taskgroup allocate(omp_high_bw_mem_alloc: allocated_x)
+!$omp task
+!CHECK: fir.call @_QPwork() {{.*}}: () -> ()
+   call work()
+!CHECK: omp.terminator
+!$omp end task
+!CHECK: omp.terminator
+!$omp end taskgroup
+end subroutine
diff --git a/flang/test/Lower/OpenMP/taskwait.f90 b/flang/test/Lower/OpenMP/taskwait.f90
new file mode 100644
index 00000000000000..b5c72aaab5943e
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskwait.f90
@@ -0,0 +1,11 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: @_QPomp_taskwait
+subroutine omp_taskwait
+  !CHECK: omp.taskwait
+  !$omp taskwait
+  !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
+  call foo()
+  !CHECK: omp.taskwait
+  !$omp taskwait
+end subroutine omp_taskwait
diff --git a/flang/test/Lower/OpenMP/taskyield.f90 b/flang/test/Lower/OpenMP/taskyield.f90
new file mode 100644
index 00000000000000..28c1d025f6dc8a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskyield.f90
@@ -0,0 +1,11 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: @_QPomp_taskyield
+subroutine omp_taskyield
+  !CHECK: omp.taskyield
+  !$omp taskyield
+  !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
+  call foo()
+  !CHECK: omp.taskyield
+  !$omp taskyield
+end subroutine omp_taskyield
diff --git a/flang/test/Lower/OpenMP/teams.f90 b/flang/test/Lower/OpenMP/teams.f90
new file mode 100644
index 00000000000000..2620d7752a4ed3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/teams.f90
@@ -0,0 +1,114 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! CHECK-LABEL: func @_QPteams_simple
+subroutine teams_simple()
+  ! CHECK: omp.teams
+  !$omp teams
+  ! CHECK: fir.call
+  call f1()
+  ! CHECK: omp.terminator
+  !$omp end teams
+end subroutine teams_simple
+
+!===============================================================================
+! `num_teams` clause
+!===============================================================================
+
+! CHECK-LABEL: func @_QPteams_numteams
+subroutine teams_numteams(num_teams)
+  integer, intent(inout) :: num_teams
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: num_teams( to %{{.*}}: i32)
+  !$omp teams num_teams(4)
+  ! CHECK: fir.call
+  call f1()
+  ! CHECK: omp.terminator
+  !$omp end teams
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: num_teams( to %{{.*}}: i32)
+  !$omp teams num_teams(num_teams)
+  ! CHECK: fir.call
+  call f2()
+  ! CHECK: omp.terminator
+  !$omp end teams
+
+end subroutine teams_numteams
+
+!===============================================================================
+! `if` clause
+!===============================================================================
+
+! CHECK-LABEL: func @_QPteams_if
+subroutine teams_if(alpha)
+  integer, intent(in) :: alpha
+  logical :: condition
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: if(%{{.*}})
+  !$omp teams if(.false.)
+  ! CHECK: fir.call
+  call f1()
+  ! CHECK: omp.terminator
+  !$omp end teams
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: if(%{{.*}})
+  !$omp teams if(alpha .le. 0)
+  ! CHECK: fir.call
+  call f2()
+  ! CHECK: omp.terminator
+  !$omp end teams
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: if(%{{.*}})
+  !$omp teams if(condition)
+  ! CHECK: fir.call
+  call f3()
+  ! CHECK: omp.terminator
+  !$omp end teams
+end subroutine teams_if
+
+!===============================================================================
+! `thread_limit` clause
+!===============================================================================
+
+! CHECK-LABEL: func @_QPteams_threadlimit
+subroutine teams_threadlimit(thread_limit)
+  integer, intent(inout) :: thread_limit
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: thread_limit(%{{.*}}: i32)
+  !$omp teams thread_limit(4)
+  ! CHECK: fir.call
+  call f1()
+  ! CHECK: omp.terminator
+  !$omp end teams
+
+  ! CHECK: omp.teams
+  ! CHECK-SAME: thread_limit(%{{.*}}: i32)
+  !$omp teams thread_limit(thread_limit)
+  ! CHECK: fir.call
+  call f2()
+  ! CHECK: omp.terminator
+  !$omp end teams
+
+end subroutine teams_threadlimit
+
+!===============================================================================
+! `allocate` clause
+!===============================================================================
+
+! CHECK-LABEL: func @_QPteams_allocate
+subroutine teams_allocate()
+   use omp_lib
+   integer :: x
+   ! CHECK: omp.teams
+   ! CHECK-SAME: allocate(%{{.+}} : i32 -> %{{.+}} : !fir.ref<i32>)
+   !$omp teams allocate(omp_high_bw_mem_alloc: x) private(x)
+   ! CHECK: arith.addi
+   x = x + 12
+   ! CHECK: omp.terminator
+   !$omp end teams
+end subroutine teams_allocate
diff --git a/flang/test/Lower/OpenMP/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/wsloop-chunks.f90
new file mode 100644
index 00000000000000..5016c8985bda04
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-chunks.f90
@@ -0,0 +1,76 @@
+! This test checks that chunk size is passed correctly when lowering of
+! OpenMP DO Directive(Worksharing) with chunk size
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program wsloop
+        integer :: i
+        integer :: chunk
+
+! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop"} {
+! CHECK:         %[[CHUNK_REF:.*]] = fir.alloca i32 {bindc_name = "chunk", uniq_name = "_QFEchunk"}
+! CHECK:         %[[VAL_0:.*]]:2 = hlfir.declare %[[CHUNK_REF]] {uniq_name = "_QFEchunk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+!$OMP DO SCHEDULE(static, 4)
+
+do i=1, 9
+  print*, i
+
+! CHECK:         %[[VAL_2:.*]] = arith.constant 1 : i32
+! CHECK:         %[[VAL_3:.*]] = arith.constant 9 : i32
+! CHECK:         %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:         %[[VAL_5:.*]] = arith.constant 4 : i32
+! CHECK:         omp.wsloop   schedule(static = %[[VAL_5]] : i32) nowait for  (%[[ARG0:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) {
+! CHECK:           fir.store %[[ARG0]] to %[[STORE_IV:.*]]#1 : !fir.ref<i32>
+! CHECK:           %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]]#0 : !fir.ref<i32>
+! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+! CHECK:           omp.yield
+! CHECK:         }
+
+end do
+!$OMP END DO NOWAIT
+!$OMP DO SCHEDULE(static, 2+2)
+
+do i=1, 9
+  print*, i*2
+
+! CHECK:         %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:         %[[VAL_15:.*]] = arith.constant 9 : i32
+! CHECK:         %[[VAL_16:.*]] = arith.constant 1 : i32
+! CHECK:         %[[VAL_17:.*]] = arith.constant 4 : i32
+! CHECK:         omp.wsloop   schedule(static = %[[VAL_17]] : i32) nowait for  (%[[ARG1:.*]]) : i32 = (%[[VAL_14]]) to (%[[VAL_15]]) inclusive step (%[[VAL_16]]) {
+! CHECK:           fir.store %[[ARG1]] to %[[STORE_IV1:.*]]#1 : !fir.ref<i32>
+! CHECK:           %[[VAL_24:.*]] = arith.constant 2 : i32
+! CHECK:           %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[LOAD_IV1]] : i32
+! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_25]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+! CHECK:           omp.yield
+! CHECK:         }
+  
+end do
+!$OMP END DO NOWAIT
+chunk = 6
+!$OMP DO SCHEDULE(static, chunk)
+
+do i=1, 9
+   print*, i*3
+end do
+!$OMP END DO NOWAIT
+! CHECK:         %[[VAL_28:.*]] = arith.constant 6 : i32
+! CHECK:         hlfir.assign %[[VAL_28]] to %[[VAL_0]]#0 : i32, !fir.ref<i32>
+! CHECK:         %[[VAL_29:.*]] = arith.constant 1 : i32
+! CHECK:         %[[VAL_30:.*]] = arith.constant 9 : i32
+! CHECK:         %[[VAL_31:.*]] = arith.constant 1 : i32
+! CHECK:         %[[VAL_32:.*]] = fir.load %[[VAL_0]]#0 : !fir.ref<i32>
+! CHECK:         omp.wsloop   schedule(static = %[[VAL_32]] : i32) nowait for  (%[[ARG2:.*]]) : i32 = (%[[VAL_29]]) to (%[[VAL_30]]) inclusive step (%[[VAL_31]]) {
+! CHECK:           fir.store %[[ARG2]] to %[[STORE_IV2:.*]]#1 : !fir.ref<i32>
+! CHECK:           %[[VAL_39:.*]] = arith.constant 3 : i32
+! CHECK:           %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_40:.*]] = arith.muli %[[VAL_39]], %[[LOAD_IV2]] : i32
+! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_40]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+! CHECK:           omp.yield
+! CHECK:         }
+! CHECK:         return
+! CHECK:       }
+
+end
diff --git a/flang/test/Lower/OpenMP/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/wsloop-collapse.f90
new file mode 100644
index 00000000000000..c93fcf4ef968da
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-collapse.f90
@@ -0,0 +1,77 @@
+! This test checks lowering of OpenMP DO Directive(Worksharing) with collapse.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+!CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "wsloop_collapse"} {
+program wsloop_collapse
+!CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
+!CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"}
+!CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "c", uniq_name = "_QFEc"}
+!CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFEc"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+!CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
+!CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_16:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFEk"}
+!CHECK:           %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
+!CHECK:           %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer :: i, j, k
+  integer :: a, b, c
+  integer :: x
+!CHECK:           %[[VAL_20:.*]] = arith.constant 3 : i32
+!CHECK:           hlfir.assign %[[VAL_20]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+  a=3
+!CHECK:           %[[VAL_21:.*]] = arith.constant 2 : i32
+!CHECK:           hlfir.assign %[[VAL_21]] to %[[VAL_9]]#0 : i32, !fir.ref<i32>
+  b=2
+!CHECK:           %[[VAL_22:.*]] = arith.constant 5 : i32
+!CHECK:           hlfir.assign %[[VAL_22]] to %[[VAL_11]]#0 : i32, !fir.ref<i32>
+  c=5
+!CHECK:           %[[VAL_23:.*]] = arith.constant 0 : i32
+!CHECK:           hlfir.assign %[[VAL_23]] to %[[VAL_19]]#0 : i32, !fir.ref<i32>
+  x=0
+
+!CHECK:           %[[VAL_24:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_25:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+!CHECK:           %[[VAL_26:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_27:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_28:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+!CHECK:           %[[VAL_29:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_30:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_31:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+!CHECK:           %[[VAL_32:.*]] = arith.constant 1 : i32
+!CHECK:           omp.wsloop   for  (%[[VAL_33:.*]], %[[VAL_34:.*]], %[[VAL_35:.*]]) : i32 = (%[[VAL_24]], %[[VAL_27]], %[[VAL_30]]) to (%[[VAL_25]], %[[VAL_28]], %[[VAL_31]]) inclusive step (%[[VAL_26]], %[[VAL_29]], %[[VAL_32]]) {
+  !$omp do collapse(3)
+  do i = 1, a
+     do j= 1, b
+        do k = 1, c
+!CHECK:             fir.store %[[VAL_33]] to %[[VAL_5]]#1 : !fir.ref<i32>
+!CHECK:             fir.store %[[VAL_34]] to %[[VAL_3]]#1 : !fir.ref<i32>
+!CHECK:             fir.store %[[VAL_35]] to %[[VAL_1]]#1 : !fir.ref<i32>
+!CHECK:             %[[VAL_36:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+!CHECK:             %[[VAL_37:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+!CHECK:             %[[VAL_38:.*]] = arith.addi %[[VAL_36]], %[[VAL_37]] : i32
+!CHECK:             %[[VAL_39:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+!CHECK:             %[[VAL_40:.*]] = arith.addi %[[VAL_38]], %[[VAL_39]] : i32
+!CHECK:             %[[VAL_41:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+!CHECK:             %[[VAL_42:.*]] = arith.addi %[[VAL_40]], %[[VAL_41]] : i32
+!CHECK:             hlfir.assign %[[VAL_42]] to %[[VAL_19]]#0 : i32, !fir.ref<i32>
+!CHECK:             omp.yield
+           x = x + i + j + k
+        end do
+     end do
+  end do
+!CHECK:           }
+  !$omp end do
+!CHECK:         return
+!CHECK:       }
+end program wsloop_collapse
diff --git a/flang/test/Lower/OpenMP/wsloop-monotonic.f90 b/flang/test/Lower/OpenMP/wsloop-monotonic.f90
new file mode 100644
index 00000000000000..fba9105b981813
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-monotonic.f90
@@ -0,0 +1,34 @@
+! This test checks lowering of OpenMP DO Directive (Worksharing) with
+! monotonic schedule modifier.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program wsloop_dynamic
+  integer :: i
+!CHECK-LABEL: func @_QQmain()
+
+!$OMP PARALLEL
+!CHECK:  omp.parallel {
+
+!$OMP DO SCHEDULE(monotonic:dynamic)
+!CHECK:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
+!CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
+!CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
+!CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
+!CHECK:     omp.wsloop schedule(dynamic, monotonic) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+!CHECK:       fir.store %[[I]] to %[[ALLOCA_IV:.*]]#1 : !fir.ref<i32>
+
+  do i=1, 9
+    print*, i
+!CHECK:    %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
+!CHECK:    %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]]#0 : !fir.ref<i32>
+!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+!CHECK:    fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
+  end do
+!CHECK:       omp.yield
+!CHECK:       omp.terminator
+!CHECK:     }
+
+!$OMP END DO NOWAIT
+!$OMP END PARALLEL
+end
diff --git a/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90 b/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90
new file mode 100644
index 00000000000000..1bd7a2edc0f523
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90
@@ -0,0 +1,37 @@
+! This test checks lowering of OpenMP DO Directive(Worksharing) with
+! non-monotonic schedule modifier.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program wsloop_dynamic
+  integer :: i
+!CHECK-LABEL: func @_QQmain()
+
+
+!$OMP PARALLEL
+!CHECK:  omp.parallel {
+
+!$OMP DO SCHEDULE(nonmonotonic:dynamic)
+!CHECK:     %[[I_REF:.*]] = fir.alloca i32 {{{.*}}, pinned}
+!CHECK:     %[[ALLOCA_IV:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
+!CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
+!CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
+!CHECK:     omp.wsloop schedule(dynamic, nonmonotonic) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+!CHECK:       fir.store %[[I]] to %[[ALLOCA_IV]]#1 : !fir.ref<i32>
+
+  do i=1, 9
+    print*, i
+!CHECK:    %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
+!CHECK:    %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]]#0 : !fir.ref<i32>
+!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+!CHECK:    fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
+  end do
+!CHECK:       omp.yield
+!CHECK:         }
+!CHECK:       omp.terminator
+!CHECK:     }
+
+!$OMP END DO NOWAIT
+!$OMP END PARALLEL
+end
diff --git a/flang/test/Lower/OpenMP/wsloop-ordered.f90 b/flang/test/Lower/OpenMP/wsloop-ordered.f90
new file mode 100644
index 00000000000000..5185d2d085bac7
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-ordered.f90
@@ -0,0 +1,40 @@
+! This test checks lowering of worksharing-loop construct with ordered clause.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! This checks lowering ordered clause specified without parameter
+subroutine wsloop_ordered_no_para()
+  integer :: a(10), i
+
+! CHECK:  omp.wsloop ordered(0) for (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+! CHECK:    omp.yield
+! CHECK:  }
+
+  !$omp do ordered
+  do i = 2, 10
+    !$omp ordered
+    a(i) = a(i-1) + 1
+    !$omp end ordered
+  end do
+  !$omp end do
+
+end
+
+! This checks lowering ordered clause specified with a parameter
+subroutine wsloop_ordered_with_para()
+  integer :: a(10), i
+
+! CHECK: func @_QPwsloop_ordered_with_para() {
+! CHECK:  omp.wsloop ordered(1) for (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+! CHECK:    omp.yield
+! CHECK:  }
+
+  !$omp do ordered(1)
+  do i = 2, 10
+    !!$omp ordered depend(sink: i-1)
+    a(i) = a(i-1) + 1
+    !!$omp ordered depend(source)
+  end do
+  !$omp end do
+
+end
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add.f90
new file mode 100644
index 00000000000000..92c0075b9f72a2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-add.f90
@@ -0,0 +1,294 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F64_NAME:.*]] : f64 init {
+!CHECK: ^bb0(%{{.*}}: f64):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
+!CHECK:  omp.yield(%[[C0_1]] : f64)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: f64, %[[ARG1:.*]]: f64):
+!CHECK:  %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f64
+!CHECK:  omp.yield(%[[RES]] : f64)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I64_NAME:.*]] : i64 init {
+!CHECK: ^bb0(%{{.*}}: i64):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i64
+!CHECK:  omp.yield(%[[C0_1]] : i64)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64):
+!CHECK:  %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i64
+!CHECK:  omp.yield(%[[RES]] : i64)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
+!CHECK: ^bb0(%{{.*}}: f32):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  omp.yield(%[[C0_1]] : f32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
+!CHECK:  %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
+!CHECK:  omp.yield(%[[RES]] : f32)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
+!CHECK: ^bb0(%{{.*}}: i32):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:  omp.yield(%[[C0_1]] : i32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK:  %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+!CHECK:  omp.yield(%[[RES]] : i32)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_reduction
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[C0_2]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_reduction
+!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[C0_2]] to %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_real_reduction
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %2 {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[C0_2]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_reduction_switch_order
+!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[C0_2]] to %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_int_reductions_same_type
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[YREF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<i32>) for  (%[[IVAL]]) : i32
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL1]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL2]], %[[Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL3]], %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 0
+  y = 0
+  z = 0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_real_reductions_same_type
+!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+!CHECK:  %[[X_DECL]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[YREF:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+!CHECK:  %[[Y_DECL]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+!CHECK:  %[[Z_DECL]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<f32>) for  (%[[IVAL]]) : i32
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1_F32:.*]] = fir.convert %[[I_PVT_VAL1_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL1_F32]], %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_F32:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL2_F32]], %[[Y_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[Z_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 0.0
+  y = 0.0
+  z = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_reductions_different_type
+!CHECK:  %[[WREF:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+!CHECK:  %[[W_DECL:.*]]:2 = hlfir.declare %[[WREF]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[YREF:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0  : !fir.ref<i32>, @[[RED_I64_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<i64>, @[[RED_F32_NAME]] -> %[[Z_DECL]]#0  : !fir.ref<f32>, @[[RED_F64_NAME]] -> %[[W_DECL]]#0  : !fir.ref<f64>) for  (%[[IVAL:.*]]) : i32
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL1_I32]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_I64:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> i64
+!CHECK:      omp.reduction %[[I_PVT_VAL2_I64]], %[[Y_DECL]]#0 : i64, !fir.ref<i64>
+!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[Z_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      %[[I_PVT_VAL4_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL4_F64:.*]] = fir.convert %[[I_PVT_VAL4_I32]] : (i32) -> f64
+!CHECK:      omp.reduction %[[I_PVT_VAL4_F64]], %[[W_DECL]]#0 : f64, !fir.ref<f64>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 0
+  y = 0
+  z = 0.0
+  w = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z,w)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+    w = w + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
new file mode 100644
index 00000000000000..29cd53616b5cb4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
@@ -0,0 +1,42 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK: omp.reduction.declare @[[IAND_DECLARE_I:.*]] : i32 init {
+!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant -1 : i32
+!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
+!CHECK: %[[IAND_VAL_I:.*]] = arith.andi %[[ARG0_I]], %[[ARG1_I]] : i32
+!CHECK: omp.yield(%[[IAND_VAL_I]] : i32)
+
+!CHECK-LABEL: @_QPreduction_iand
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+
+!CHECK: omp.parallel
+!CHECK: %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.wsloop reduction(@[[IAND_DECLARE_I]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for 
+!CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK: %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: omp.reduction %[[Y_I]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_iand(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(iand:x)
+  do i=1, 100
+    x = iand(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90
new file mode 100644
index 00000000000000..3131d1b551737e
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90
@@ -0,0 +1,43 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK: omp.reduction.declare @[[IEOR_DECLARE_I:.*]] : i32 init {
+!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant 0 : i32
+!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
+!CHECK: %[[IEOR_VAL_I:.*]] = arith.xori %[[ARG0_I]], %[[ARG1_I]] : i32
+!CHECK: omp.yield(%[[IEOR_VAL_I]] : i32)
+
+!CHECK-LABEL: @_QPreduction_ieor
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+
+
+!CHECK: omp.parallel
+!CHECK: %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.wsloop reduction(@[[IEOR_DECLARE_I]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for 
+!CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK: %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: omp.reduction %[[Y_I]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ieor(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ieor:x)
+  do i=1, 100
+    x = ieor(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90
new file mode 100644
index 00000000000000..5e3d5bdd6c52c4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90
@@ -0,0 +1,41 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK: omp.reduction.declare @[[IOR_DECLARE_I:.*]] : i32 init {
+!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant 0 : i32
+!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
+!CHECK: %[[IOR_VAL_I:.*]] = arith.ori %[[ARG0_I]], %[[ARG1_I]] : i32
+!CHECK: omp.yield(%[[IOR_VAL_I]] : i32)
+
+!CHECK-LABEL: @_QPreduction_ior
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!CHECK: omp.parallel
+!CHECK: %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.wsloop reduction(@[[IOR_DECLARE_I]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for 
+!CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK: %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: omp.reduction %[[Y_I]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ior(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ior:x)
+  do i=1, 100
+    x = ior(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90
new file mode 100644
index 00000000000000..243c8a1f874d51
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90
@@ -0,0 +1,139 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_NAME:.*]] : !fir.logical<4> init {
+!CHECK: ^bb0(%{{.*}}: !fir.logical<4>):
+!CHECK:  %true = arith.constant true
+!CHECK:  %[[true_fir:.*]] = fir.convert %true : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[true_fir]]  : !fir.logical<4>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.logical<4>, %[[ARG1:.*]]: !fir.logical<4>):
+!CHECK:  %[[arg0_i1:.*]] = fir.convert %[[ARG0]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[arg1_i1:.*]] = fir.convert %[[ARG1]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[RES:.*]] = arith.andi %[[arg0_i1]], %[[arg1_i1]] : i1
+!CHECK:  %[[RES_logical:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[RES_logical]] : !fir.logical<4>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_reduction(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%4) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_64:.*]] = fir.convert %[[I_PVT]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_PVT_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[Y_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x)
+  do i=1, 100
+    x = x .and. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[CONVI_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[YVAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[YVAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x)
+  do i=1, 100
+  x = y(i) .and. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_reductions
+!CHECK-SAME %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[W_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_1]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Y_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Z_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x,y,z)
+  do i=1, 100
+  x = x .and. w(i)
+  y = y .and. w(i)
+  z = z .and. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90
new file mode 100644
index 00000000000000..f25ed6300501b2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90
@@ -0,0 +1,139 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_NAME:.*]] : !fir.logical<4> init {
+!CHECK: ^bb0(%{{.*}}: !fir.logical<4>):
+!CHECK:  %true = arith.constant true
+!CHECK:  %[[true_fir:.*]] = fir.convert %true : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[true_fir]]  : !fir.logical<4>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.logical<4>, %[[ARG1:.*]]: !fir.logical<4>):
+!CHECK:  %[[arg0_i1:.*]] = fir.convert %[[ARG0]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[arg1_i1:.*]] = fir.convert %[[ARG1]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[RES:.*]] = arith.cmpi eq, %[[arg0_i1]], %[[arg1_i1]] : i1
+!CHECK:  %[[RES_logical:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[RES_logical]] : !fir.logical<4>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_reduction(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%4) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_64:.*]] = fir.convert %[[I_PVT]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_PVT_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[Y_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+    x = x .eqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[CONVI_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[YVAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[YVAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+  x = y(i) .eqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_reductions
+!CHECK-SAME %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[W_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Y_DECL]]#0 :
+!!fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_1]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Y_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Z_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x,y,z)
+  do i=1, 100
+  x = x .eqv. w(i)
+  y = y .eqv. w(i)
+  z = z .eqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90
new file mode 100644
index 00000000000000..54227cbed56029
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90
@@ -0,0 +1,139 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_NAME:.*]] : !fir.logical<4> init {
+!CHECK: ^bb0(%{{.*}}: !fir.logical<4>):
+!CHECK:  %false = arith.constant false
+!CHECK:  %[[false_fir:.*]] = fir.convert %false : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[false_fir]]  : !fir.logical<4>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.logical<4>, %[[ARG1:.*]]: !fir.logical<4>):
+!CHECK:  %[[arg0_i1:.*]] = fir.convert %[[ARG0]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[arg1_i1:.*]] = fir.convert %[[ARG1]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[RES:.*]] = arith.cmpi ne, %[[arg0_i1]], %[[arg1_i1]] : i1
+!CHECK:  %[[RES_logical:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[RES_logical]] : !fir.logical<4>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_reduction(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%4) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_64:.*]] = fir.convert %[[I_PVT]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_PVT_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[Y_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+    x = x .neqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[CONVI_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[YVAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[YVAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+  x = y(i) .neqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_reductions
+!CHECK-SAME %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[W_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Y_DECL]]#0 :
+!!fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_1]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Y_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Z_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x,y,z)
+  do i=1, 100
+  x = x .neqv. w(i)
+  y = y .neqv. w(i)
+  z = z .neqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90
new file mode 100644
index 00000000000000..4f59ea778cf885
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90
@@ -0,0 +1,140 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_NAME:.*]] : !fir.logical<4> init {
+!CHECK: ^bb0(%{{.*}}: !fir.logical<4>):
+!CHECK:  %false = arith.constant false
+!CHECK:  %[[false_fir:.*]] = fir.convert %false : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[false_fir]]  : !fir.logical<4>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.logical<4>, %[[ARG1:.*]]: !fir.logical<4>):
+!CHECK:  %[[arg0_i1:.*]] = fir.convert %[[ARG0]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[arg1_i1:.*]] = fir.convert %[[ARG1]] : (!fir.logical<4>) -> i1
+!CHECK:  %[[RES:.*]] = arith.ori %[[arg0_i1]], %[[arg1_i1]] : i1
+!CHECK:  %[[RES_logical:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+!CHECK:  omp.yield(%[[RES_logical]] : !fir.logical<4>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_reduction(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%4) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_64:.*]] = fir.convert %[[I_PVT]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_PVT_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[Y_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x)
+  do i=1, 100
+    x = x .or. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
+!CHECK-SAME: %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[CONVI_64]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[YVAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[YVAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x)
+  do i=1, 100
+  x = y(i) .or. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_reductions
+!CHECK-SAME %[[ARRAY:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[W_DECL:.*]]:2 = hlfir.declare %[[ARRAY]](%{{.*}}) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+!CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 100 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[X_DECL]]#0 : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Y_DECL]]#0 :
+!!fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_1]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[X_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Y_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[W_I_REF:.*]] = hlfir.designate %[[W_DECL]]#0 (%[[CONVI_64_2]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+!CHECK:      %[[W_I_VAL:.*]] = fir.load %[[W_I_REF]] : !fir.ref<!fir.logical<4>>
+!CHECK:      omp.reduction %[[W_I_VAL]], %[[Z_DECL]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x,y,z)
+  do i=1, 100
+  x = x .or. w(i)
+  y = y .or. w(i)
+  z = z .or. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90
new file mode 100644
index 00000000000000..a2c4b5470c26cf
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90
@@ -0,0 +1,89 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @[[MAX_DECLARE_F:.*]] : f32 init {
+!CHECK:   %[[MINIMUM_VAL_F:.*]] = arith.constant -3.40282347E+38 : f32
+!CHECK:   omp.yield(%[[MINIMUM_VAL_F]] : f32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32):
+!CHECK:   %[[COMB_VAL_F:.*]] = arith.maximumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32
+!CHECK:   omp.yield(%[[COMB_VAL_F]] : f32)
+
+!CHECK: omp.reduction.declare @[[MAX_DECLARE_I:.*]] : i32 init {
+!CHECK:   %[[MINIMUM_VAL_I:.*]] = arith.constant -2147483648 : i32
+!CHECK:   omp.yield(%[[MINIMUM_VAL_I]] : i32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
+!CHECK:   %[[COMB_VAL_I:.*]] = arith.maxsi %[[ARG0_I]], %[[ARG1_I]] : i32
+!CHECK:   omp.yield(%[[COMB_VAL_I]] : i32)
+
+!CHECK-LABEL: @_QPreduction_max_int
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+!CHECK:   %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:   %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!CHECK:   omp.parallel
+!CHECK:      %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:      %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     omp.wsloop reduction(@[[MAX_DECLARE_I]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for
+!CHECK:       fir.store %arg1 to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK:       %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK:       %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]]) : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK:       %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK:       omp.reduction %[[Y_I_VAL]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+!CHECK-LABEL: @_QPreduction_max_real
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
+!CHECK:   %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:   %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+!CHECK:   omp.parallel
+!CHECK:      %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:      %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     omp.wsloop reduction(@[[MAX_DECLARE_F]] -> %[[X_DECL]]#0 : !fir.ref<f32>) for
+!CHECK:       fir.store %arg1 to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK:       %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK:       %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]]) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+!CHECK:       %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_max_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    !CHECK-NOT: omp.reduction
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90
new file mode 100644
index 00000000000000..c1c91122efddc6
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90
@@ -0,0 +1,89 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @[[MIN_DECLARE_F:.*]] : f32 init {
+!CHECK:   %[[MAXIMUM_VAL_F:.*]] = arith.constant -1.401300e-45 : f32
+!CHECK:   omp.yield(%[[MAXIMUM_VAL_F]] : f32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32):
+!CHECK:   %[[COMB_VAL_F:.*]] = arith.minimumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32
+!CHECK:   omp.yield(%[[COMB_VAL_F]] : f32)
+
+!CHECK: omp.reduction.declare @[[MIN_DECLARE_I:.*]] : i32 init {
+!CHECK:   %[[MAXIMUM_VAL_I:.*]] = arith.constant 2147483647 : i32
+!CHECK:   omp.yield(%[[MAXIMUM_VAL_I]] : i32)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
+!CHECK:   %[[COMB_VAL_I:.*]] = arith.minsi %[[ARG0_I]], %[[ARG1_I]] : i32
+!CHECK:   omp.yield(%[[COMB_VAL_I]] : i32)
+
+!CHECK-LABEL: @_QPreduction_min_int
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
+!CHECK:   %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:   %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!CHECK:   omp.parallel
+!CHECK:      %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:      %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     omp.wsloop reduction(@[[MIN_DECLARE_I]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for
+!CHECK:       fir.store %arg1 to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK:       %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK:       %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]]) : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK:       %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK:       omp.reduction %[[Y_I_VAL]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+!CHECK-LABEL: @_QPreduction_min_real
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
+!CHECK:   %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:   %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+!CHECK:   omp.parallel
+!CHECK:      %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:      %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     omp.wsloop reduction(@[[MIN_DECLARE_F]] -> %[[X_DECL]]#0 : !fir.ref<f32>) for
+!CHECK:       fir.store %arg1 to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK:       %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK:       %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]]) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+!CHECK:       %[[Y_I_VAL:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+subroutine reduction_min_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_min_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+  !CHECK-NOT: omp.reduction
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90
new file mode 100644
index 00000000000000..7dc8aeeb85592b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90
@@ -0,0 +1,294 @@
+! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F64_NAME:.*]] : f64 init {
+!CHECK: ^bb0(%{{.*}}: f64):
+!CHECK:  %[[C0_1:.*]] = arith.constant 1.000000e+00 : f64
+!CHECK:  omp.yield(%[[C0_1]] : f64)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: f64, %[[ARG1:.*]]: f64):
+!CHECK:  %[[RES:.*]] = arith.mulf %[[ARG0]], %[[ARG1]] {{.*}}: f64
+!CHECK:  omp.yield(%[[RES]] : f64)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I64_NAME:.*]] : i64 init {
+!CHECK: ^bb0(%{{.*}}: i64):
+!CHECK:  %[[C1_1:.*]] = arith.constant 1 : i64
+!CHECK:  omp.yield(%[[C1_1]] : i64)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64):
+!CHECK:  %[[RES:.*]] = arith.muli %[[ARG0]], %[[ARG1]] : i64
+!CHECK:  omp.yield(%[[RES]] : i64)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
+!CHECK: ^bb0(%{{.*}}: f32):
+!CHECK:  %[[C0_1:.*]] = arith.constant 1.000000e+00 : f32
+!CHECK:  omp.yield(%[[C0_1]] : f32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
+!CHECK:  %[[RES:.*]] = arith.mulf %[[ARG0]], %[[ARG1]] {{.*}}: f32
+!CHECK:  omp.yield(%[[RES]] : f32)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
+!CHECK: ^bb0(%{{.*}}: i32):
+!CHECK:  %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:  omp.yield(%[[C1_1]] : i32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK:  %[[RES:.*]] = arith.muli %[[ARG0]], %[[ARG1]] : i32
+!CHECK:  omp.yield(%[[RES]] : i32)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_reduction
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:  hlfir.assign %[[C1_2]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C10:.*]] = arith.constant 10 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C10]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 1
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+    x = x * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_reduction
+!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[C0_2:.*]] = arith.constant 1.000000e+00 : f32
+!CHECK:  hlfir.assign %[[C0_2]] to %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 10 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_real_reduction
+  real :: x
+  x = 1.0
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+    x = x * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:  hlfir.assign %[[C1_2]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C10:.*]] = arith.constant 10 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C10]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 1
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+  x = i * x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_reduction_switch_order
+!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[C1_2:.*]] = arith.constant 1.000000e+00 : f32
+!CHECK:  hlfir.assign %[[C1_2]] to %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
+!CHECK:    %[[C100:.*]] = arith.constant 10 : i32
+!CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
+!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 1.0
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+  x = i * x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_int_reductions_same_type
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[YREF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<i32>) for  (%[[IVAL]]) : i32
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL1]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL2]], %[[Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL3]], %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 1
+  y = 1
+  z = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z)
+  do i=1, 10
+  x = x * i
+  y = y * i
+  z = z * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_real_reductions_same_type
+!CHECK:  %[[XREF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[YREF:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[X_DECL]]#0 : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[Z_DECL]]#0 : !fir.ref<f32>) for  (%[[IVAL]]) : i32
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1_F32:.*]] = fir.convert %[[I_PVT_VAL1_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL1_F32]], %[[X_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_F32:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL2_F32]], %[[Y_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[Z_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      omp.yield
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 1
+  y = 1
+  z = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z)
+  do i=1, 10
+    x = x * i
+    y = y * i
+    z = z * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_reductions_different_type
+!CHECK:  %[[WREF:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+!CHECK:  %[[W_DECL:.*]]:2 = hlfir.declare %[[WREF]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+!CHECK:  %[[XREF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+!CHECK:  %[[X_DECL:.*]]:2 = hlfir.declare %[[XREF]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[YREF:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+!CHECK:  %[[Y_DECL:.*]]:2 = hlfir.declare %[[YREF]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+!CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+!CHECK:  %[[Z_DECL:.*]]:2 = hlfir.declare %[[ZREF]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  omp.parallel
+!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[X_DECL]]#0  : !fir.ref<i32>, @[[RED_I64_NAME]] -> %[[Y_DECL]]#0 : !fir.ref<i64>, @[[RED_F32_NAME]] -> %[[Z_DECL]]#0  : !fir.ref<f32>, @[[RED_F64_NAME]] -> %[[W_DECL]]#0  : !fir.ref<f64>) for  (%[[IVAL:.*]]) : i32
+!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      omp.reduction %[[I_PVT_VAL1_I32]], %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_I64:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> i64
+!CHECK:      omp.reduction %[[I_PVT_VAL2_I64]], %[[Y_DECL]]#0 : i64, !fir.ref<i64>
+!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[Z_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:      %[[I_PVT_VAL4_I32:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL4_F64:.*]] = fir.convert %[[I_PVT_VAL4_I32]] : (i32) -> f64
+!CHECK:      omp.reduction %[[I_PVT_VAL4_F64]], %[[W_DECL]]#0 : f64, !fir.ref<f64>
+!CHECK:    omp.terminator
+!CHECK:  return
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 1
+  y = 1
+  z = 1
+  w = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z,w)
+  do i=1, 10
+    x = x * i
+    y = y * i
+    z = z * i
+    w = w * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
new file mode 100644
index 00000000000000..c3d5e3e0cda593
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -0,0 +1,34 @@
+! This test checks lowering of OpenMP DO Directive(Worksharing) with
+! simd schedule modifier.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program wsloop_dynamic
+  integer :: i
+!CHECK-LABEL: func @_QQmain()
+
+!$OMP PARALLEL
+!CHECK:  omp.parallel {
+
+!$OMP DO SCHEDULE(simd: runtime)
+!CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
+!CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
+!CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
+!CHECK:     omp.wsloop schedule(runtime, simd) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+!CHECK:       fir.store %[[I]] to %[[STORE:.*]]#1 : !fir.ref<i32>
+
+  do i=1, 9
+    print*, i
+!CHECK:    %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
+!CHECK:    %[[LOAD:.*]] = fir.load %[[STORE]]#0 : !fir.ref<i32>
+!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+!CHECK:    fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
+  end do
+!CHECK:       omp.yield
+!CHECK:         }
+!CHECK:       omp.terminator
+!CHECK:     }
+
+!$OMP END DO NOWAIT
+!$OMP END PARALLEL
+end
diff --git a/flang/test/Lower/OpenMP/wsloop-variable.f90 b/flang/test/Lower/OpenMP/wsloop-variable.f90
new file mode 100644
index 00000000000000..b3758f1fdc00ff
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-variable.f90
@@ -0,0 +1,186 @@
+! This test checks lowering of OpenMP DO Directive(Worksharing) for different
+! types of loop iteration variable, lower bound, upper bound, and step.
+
+!REQUIRES: shell
+!RUN: bbc -fopenmp -emit-hlfir %s -o - 2>&1 | FileCheck %s
+
+!CHECK:  OpenMP loop iteration variable cannot have more than 64 bits size and will be narrowed into 64 bits.
+
+program wsloop_variable
+  integer(kind=1) :: i1_lb, i1_ub
+  integer(kind=2) :: i2, i2_ub, i2_s
+  integer(kind=4) :: i4_s
+  integer(kind=8) :: i8, i8_s
+  integer(kind=16) :: i16, i16_lb
+  real :: x
+
+!CHECK:  %[[TMP0:.*]] = arith.constant 1 : i32
+!CHECK:  %[[TMP1:.*]] = arith.constant 100 : i32
+!CHECK:  %[[TMP2:.*]] = fir.convert %[[TMP0]] : (i32) -> i64
+!CHECK:  %[[TMP3:.*]] = fir.convert %{{.*}} : (i8) -> i64
+!CHECK:  %[[TMP4:.*]] = fir.convert %{{.*}} : (i16) -> i64
+!CHECK:  %[[TMP5:.*]] = fir.convert %{{.*}} : (i128) -> i64
+!CHECK:  %[[TMP6:.*]] = fir.convert %[[TMP1]] : (i32) -> i64
+!CHECK:  %[[TMP7:.*]] = fir.convert %{{.*}} : (i32) -> i64
+!CHECK:  omp.wsloop for (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) {
+!CHECK:    %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i64) -> i16
+!CHECK:    fir.store %[[ARG0_I16]] to %[[STORE_IV0:.*]]#1 : !fir.ref<i16>
+!CHECK:    fir.store %[[ARG1]] to %[[STORE_IV1:.*]]#1 : !fir.ref<i64>
+!CHECK:    %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]]#0 : !fir.ref<i16>
+!CHECK:    %[[LOAD_IV0_I64:.*]] = fir.convert %[[LOAD_IV0]] : (i16) -> i64
+!CHECK:    %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]]#0 : !fir.ref<i64>
+!CHECK:    %[[TMP10:.*]] = arith.addi %[[LOAD_IV0_I64]], %[[LOAD_IV1]] : i64
+!CHECK:    %[[TMP11:.*]] = fir.convert %[[TMP10]] : (i64) -> f32
+!CHECK:    hlfir.assign %[[TMP11]] to %{{.*}} : f32, !fir.ref<f32>
+!CHECK:    omp.yield
+!CHECK:  }
+
+  !$omp do collapse(2)
+  do i2 = 1, i1_ub, i2_s
+    do i8 = i16_lb, 100, i4_s
+      x = i2 + i8
+    end do
+  end do
+  !$omp end do
+
+!CHECK:  %[[TMP12:.*]] = arith.constant 1 : i32
+!CHECK:  %[[TMP13:.*]] = fir.convert %{{.*}} : (i8) -> i32
+!CHECK:  %[[TMP14:.*]] = fir.convert %{{.*}} : (i64) -> i32
+!CHECK:  omp.wsloop for (%[[ARG0:.*]]) : i32 = (%[[TMP12]]) to (%[[TMP13]]) inclusive step (%[[TMP14]])  {
+!CHECK:    %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i32) -> i16
+!CHECK:    fir.store %[[ARG0_I16]] to %[[STORE3:.*]]#1 : !fir.ref<i16>
+!CHECK:    %[[LOAD3:.*]] = fir.load %[[STORE3]]#0 : !fir.ref<i16>
+!CHECK:    %[[TMP16:.*]] = fir.convert %[[LOAD3]] : (i16) -> f32
+!CHECK:    hlfir.assign %[[TMP16]] to %{{.*}} : f32, !fir.ref<f32>
+!CHECK:    omp.yield
+!CHECK:  }
+
+  !$omp do
+  do i2 = 1, i1_ub, i8_s
+    x = i2
+  end do
+  !$omp end do
+
+!CHECK:  %[[TMP17:.*]] = fir.convert %{{.*}} : (i8) -> i64
+!CHECK:  %[[TMP18:.*]] = fir.convert %{{.*}} : (i16) -> i64
+!CHECK:  %[[TMP19:.*]] = fir.convert %{{.*}} : (i32) -> i64
+!CHECK:  omp.wsloop for (%[[ARG1:.*]]) : i64 = (%[[TMP17]]) to (%[[TMP18]]) inclusive step (%[[TMP19]])  {
+!CHECK:    %[[ARG1_I128:.*]] = fir.convert %[[ARG1]] : (i64) -> i128
+!CHECK:    fir.store %[[ARG1_I128]] to %[[STORE4:.*]]#1 : !fir.ref<i128>
+!CHECK:    %[[LOAD4:.*]] = fir.load %[[STORE4]]#0 : !fir.ref<i128>
+!CHECK:    %[[TMP21:.*]] = fir.convert %[[LOAD4]] : (i128) -> f32
+!CHECK:    hlfir.assign %[[TMP21]] to %{{.*}} : f32, !fir.ref<f32>
+!CHECK:    omp.yield
+!CHECK:  }
+
+  !$omp do
+  do i16 = i1_lb, i2_ub, i4_s
+    x = i16
+  end do
+  !$omp end do
+
+end program wsloop_variable
+
+!CHECK-LABEL: func.func @_QPwsloop_variable_sub() {
+!CHECK:           %[[VAL_0:.*]] = fir.alloca i8 {adapt.valuebyref, pinned}
+!CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFwsloop_variable_subEi1"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+!CHECK:           %[[VAL_2:.*]] = fir.alloca i16 {adapt.valuebyref, pinned}
+!CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFwsloop_variable_subEi2"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+!CHECK:           %[[VAL_4:.*]] = fir.alloca i8 {bindc_name = "i1", uniq_name = "_QFwsloop_variable_subEi1"}
+!CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFwsloop_variable_subEi1"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+!CHECK:           %[[VAL_6:.*]] = fir.alloca i128 {bindc_name = "i16_lb", uniq_name = "_QFwsloop_variable_subEi16_lb"}
+!CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFwsloop_variable_subEi16_lb"} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>)
+!CHECK:           %[[VAL_8:.*]] = fir.alloca i8 {bindc_name = "i1_ub", uniq_name = "_QFwsloop_variable_subEi1_ub"}
+!CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFwsloop_variable_subEi1_ub"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+!CHECK:           %[[VAL_10:.*]] = fir.alloca i16 {bindc_name = "i2", uniq_name = "_QFwsloop_variable_subEi2"}
+!CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFwsloop_variable_subEi2"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+!CHECK:           %[[VAL_12:.*]] = fir.alloca i16 {bindc_name = "i2_s", uniq_name = "_QFwsloop_variable_subEi2_s"}
+!CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFwsloop_variable_subEi2_s"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+!CHECK:           %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "i4_s", uniq_name = "_QFwsloop_variable_subEi4_s"}
+!CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFwsloop_variable_subEi4_s"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_16:.*]] = fir.alloca i64 {bindc_name = "i8", uniq_name = "_QFwsloop_variable_subEi8"}
+!CHECK:           %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFwsloop_variable_subEi8"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+!CHECK:           %[[VAL_18:.*]] = fir.alloca i8 {bindc_name = "j1", uniq_name = "_QFwsloop_variable_subEj1"}
+!CHECK:           %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFwsloop_variable_subEj1"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+!CHECK:           %[[VAL_20:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFwsloop_variable_subEx"}
+!CHECK:           %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFwsloop_variable_subEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine wsloop_variable_sub
+  integer(kind=1) :: i1, i1_ub, j1
+  integer(kind=2) :: i2, i2_s
+  integer(kind=4) :: i4_s
+  integer(kind=8) :: i8
+  integer(kind=16) :: i16_lb
+  real :: x
+
+!CHECK:           %[[VAL_22:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_23:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i8>
+!CHECK:           %[[VAL_24:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<i16>
+!CHECK:           %[[VAL_25:.*]] = fir.convert %[[VAL_23]] : (i8) -> i32
+!CHECK:           %[[VAL_26:.*]] = fir.convert %[[VAL_24]] : (i16) -> i32
+!CHECK:           omp.wsloop   for  (%[[VAL_27:.*]]) : i32 = (%[[VAL_22]]) to (%[[VAL_25]]) inclusive step (%[[VAL_26]]) {
+!CHECK:             %[[VAL_28:.*]] = fir.convert %[[VAL_27]] : (i32) -> i16
+!CHECK:             fir.store %[[VAL_28]] to %[[VAL_3]]#1 : !fir.ref<i16>
+!CHECK:             %[[VAL_29:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i128>
+!CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i128) -> index
+!CHECK:             %[[VAL_31:.*]] = arith.constant 100 : i32
+!CHECK:             %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> index
+!CHECK:             %[[VAL_33:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+!CHECK:             %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (i32) -> index
+!CHECK:             %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (index) -> i64
+!CHECK:             %[[VAL_36:.*]]:2 = fir.do_loop %[[VAL_37:.*]] = %[[VAL_30]] to %[[VAL_32]] step %[[VAL_34]] iter_args(%[[VAL_38:.*]] = %[[VAL_35]]) -> (index, i64) {
+!CHECK:               fir.store %[[VAL_38]] to %[[VAL_17]]#1 : !fir.ref<i64>
+!CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i16>
+!CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i16) -> i64
+!CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<i64>
+!CHECK:               %[[VAL_42:.*]] = arith.addi %[[VAL_40]], %[[VAL_41]] : i64
+!CHECK:               %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i64) -> f32
+!CHECK:               hlfir.assign %[[VAL_43]] to %[[VAL_21]]#0 : f32, !fir.ref<f32>
+!CHECK:               %[[VAL_44:.*]] = arith.addi %[[VAL_37]], %[[VAL_34]] : index
+!CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_34]] : (index) -> i64
+!CHECK:               %[[VAL_46:.*]] = fir.load %[[VAL_17]]#1 : !fir.ref<i64>
+!CHECK:               %[[VAL_47:.*]] = arith.addi %[[VAL_46]], %[[VAL_45]] : i64
+!CHECK:               fir.result %[[VAL_44]], %[[VAL_47]] : index, i64
+!CHECK:             }
+!CHECK:             fir.store %[[VAL_48:.*]]#1 to %[[VAL_17]]#1 : !fir.ref<i64>
+!CHECK:             omp.yield
+!CHECK:           }
+
+  !$omp do
+  do i2 = 1, i1_ub, i2_s
+    do i8 = i16_lb, 100, i4_s
+      x = i2 + i8
+    end do
+  end do
+  !$omp end do
+
+
+!CHECK:           %[[VAL_49:.*]] = arith.constant 5 : i8
+!CHECK:           hlfir.assign %[[VAL_49]] to %[[VAL_19]]#0 : i8, !fir.ref<i8>
+!CHECK:           %[[VAL_50:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_51:.*]] = arith.constant 10 : i32
+!CHECK:           %[[VAL_52:.*]] = arith.constant 1 : i32
+!CHECK:           omp.wsloop   for  (%[[VAL_53:.*]]) : i32 = (%[[VAL_50]]) to (%[[VAL_51]]) inclusive step (%[[VAL_52]]) {
+!CHECK:             %[[VAL_54:.*]] = fir.convert %[[VAL_53]] : (i32) -> i8
+!CHECK:             fir.store %[[VAL_54]] to %[[VAL_1]]#1 : !fir.ref<i8>
+!CHECK:             %[[VAL_55:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i8>
+!CHECK:             %[[VAL_56:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i8>
+!CHECK:             %[[VAL_57:.*]] = arith.cmpi eq, %[[VAL_55]], %[[VAL_56]] : i8
+!CHECK:             fir.if %[[VAL_57]] {
+!CHECK:             } else {
+!CHECK:             }
+!CHECK:             omp.yield
+!CHECK:           }
+  j1 = 5
+  !$omp do
+  do i1 = 1, 10
+    if (i1 .eq. j1) then
+      print *, "EQ"
+    end if
+  end do
+  !$omp end do
+
+!CHECK:         return
+!CHECK:       }
+
+end
diff --git a/flang/test/Lower/allocatable-assignment.f90 b/flang/test/Lower/allocatable-assignment.f90
index 5d2f28177bcbfb..1a1ca935cf775b 100644
--- a/flang/test/Lower/allocatable-assignment.f90
+++ b/flang/test/Lower/allocatable-assignment.f90
@@ -736,8 +736,8 @@ subroutine test_cst_char(x, c)
   character(10), allocatable  :: x(:)
   character(12) :: c(20)
 ! CHECK:         %[[VAL_2:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:         %[[VAL_3:.*]] = arith.constant 12 : index
 ! CHECK:         %[[VAL_4:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<20x!fir.char<1,12>>>
+! CHECK:         %[[VAL_3:.*]] = arith.constant 12 : index
 ! CHECK:         %[[VAL_5:.*]] = arith.constant 20 : index
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 20 : index
 ! CHECK:         %[[VAL_7:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/allocatable-callee.f90 b/flang/test/Lower/allocatable-callee.f90
index 5627cd44f62e12..37c57926db0483 100644
--- a/flang/test/Lower/allocatable-callee.f90
+++ b/flang/test/Lower/allocatable-callee.f90
@@ -34,8 +34,7 @@ subroutine test_char_scalar_deferred(c)
   ! CHECK: %[[box:.*]] = fir.load %[[arg0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
   ! CHECK-DAG: %[[len:.*]] = fir.box_elesize %[[box]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
-  ! CHECK-DAG: %[[addr_cast:.*]] = fir.convert %[[addr]] : (!fir.heap<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr_cast]], %[[len]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.boxchar<1>
   ! CHECK: fir.call @_QPfoo1(%[[boxchar]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
@@ -47,8 +46,7 @@ subroutine test_char_scalar_explicit_cst(c)
   call foo1(c)
   ! CHECK: %[[box:.*]] = fir.load %[[arg0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.char<1,10>>>) -> !fir.heap<!fir.char<1,10>>
-  ! CHECK-DAG: %[[addr_cast:.*]] = fir.convert %[[addr]] : (!fir.heap<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr_cast]], %c10{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}} : (!fir.heap<!fir.char<1,10>>, index) -> !fir.boxchar<1>
   ! CHECK: fir.call @_QPfoo1(%[[boxchar]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
@@ -68,9 +66,8 @@ subroutine test_char_scalar_explicit_dynamic(c, n)
   call foo1(c)
   ! CHECK: %[[box:.*]] = fir.load %[[arg0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
-  ! CHECK-DAG: %[[addr_cast:.*]] = fir.convert %[[addr]] : (!fir.heap<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
   ! CHECK-DAG: %[[len_cast:.*]] = fir.convert %[[len]] : (i32) -> index
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr_cast]], %[[len_cast]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len_cast]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.boxchar<1>
   ! CHECK: fir.call @_QPfoo1(%[[boxchar]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
@@ -98,7 +95,7 @@ subroutine test_char_array_explicit_cst(c)
   ! CHECK: %[[box:.*]] = fir.load %[[arg0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>) -> !fir.heap<!fir.array<?x!fir.char<1,10>>>
   ! [...] address computation
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %{{.*}}, %c10{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %{{.*}}, %c10{{.*}} : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
   ! CHECK: fir.call @_QPfoo1(%[[boxchar]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
@@ -138,8 +135,7 @@ subroutine test_char_scalar_deferred_k2(c)
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.char<2,?>>>) -> !fir.heap<!fir.char<2,?>>
   ! CHECK-DAG: %[[size:.*]] = fir.box_elesize %[[box]] : (!fir.box<!fir.heap<!fir.char<2,?>>>) -> index
   ! CHECK-DAG: %[[len:.*]] = arith.divsi %[[size]], %c2{{.*}} : index
-  ! CHECK-DAG: %[[addr_cast:.*]] = fir.convert %[[addr]] : (!fir.heap<!fir.char<2,?>>) -> !fir.ref<!fir.char<2,?>>
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr_cast]], %[[len]] : (!fir.ref<!fir.char<2,?>>, index) -> !fir.boxchar<2>
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len]] : (!fir.heap<!fir.char<2,?>>, index) -> !fir.boxchar<2>
   ! CHECK: fir.call @_QPfoo2(%[[boxchar]]) {{.*}}: (!fir.boxchar<2>) -> ()
 end subroutine
 
diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90
index 44f94d453cce98..24d3647210afca 100644
--- a/flang/test/Lower/allocatables.f90
+++ b/flang/test/Lower/allocatables.f90
@@ -97,8 +97,7 @@ subroutine char_deferred(n)
   call bar(c)
   ! CHECK-DAG: %[[cLen:.*]] = fir.load %[[cLenVar]] : !fir.ref<index>
   ! CHECK-DAG: %[[cAddr:.*]] = fir.load %[[cAddrVar]] : !fir.ref<!fir.heap<!fir.char<1,?>>>
-  ! CHECK-DAG: %[[cAddrcast:.*]] = fir.convert %[[cAddr]] : (!fir.heap<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: fir.emboxchar %[[cAddrcast]], %[[cLen]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: fir.emboxchar %[[cAddr]], %[[cLen]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 end subroutine
 
 ! CHECK-LABEL: _QPchar_explicit_cst(
@@ -120,8 +119,7 @@ subroutine char_explicit_cst(n)
   ! CHECK: fir.allocmem !fir.char<1,10> {{{.*}}uniq_name = "_QFchar_explicit_cstEc.alloc"}
   call bar(c)
   ! CHECK: %[[cAddr:.*]] = fir.load %[[cAddrVar]] : !fir.ref<!fir.heap<!fir.char<1,10>>>
-  ! CHECK: %[[cAddrcast:.*]] = fir.convert %[[cAddr]] : (!fir.heap<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: fir.emboxchar %[[cAddrcast]], %[[cLen]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: fir.emboxchar %[[cAddr]], %[[cLen]] : (!fir.heap<!fir.char<1,10>>, index) -> !fir.boxchar<1>
 end subroutine
 
 ! CHECK-LABEL: _QPchar_explicit_dyn(
@@ -150,8 +148,7 @@ subroutine char_explicit_dyn(l1, l2)
   call bar(c)
   ! CHECK-DAG: %[[cLenCast4:.*]] = fir.convert %[[cLen]] : (i32) -> index
   ! CHECK-DAG: %[[cAddr:.*]] = fir.load %[[cAddrVar]] : !fir.ref<!fir.heap<!fir.char<1,?>>>
-  ! CHECK-DAG: %[[cAddrcast:.*]] = fir.convert %[[cAddr]] : (!fir.heap<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: fir.emboxchar %[[cAddrcast]], %[[cLenCast4]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: fir.emboxchar %[[cAddr]], %[[cLenCast4]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 end subroutine
 
 ! CHECK-LABEL: _QPspecifiers(
diff --git a/flang/test/Lower/array-elemental-calls-char-byval.f90 b/flang/test/Lower/array-elemental-calls-char-byval.f90
index 08f73e6dc4ddac..de6457316d5349 100644
--- a/flang/test/Lower/array-elemental-calls-char-byval.f90
+++ b/flang/test/Lower/array-elemental-calls-char-byval.f90
@@ -104,8 +104,7 @@ subroutine foo3(i, j)
 ! CHECK:   %[[VAL_59:.*]] = fir.insert_value %[[VAL_58]], %[[VAL_57]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
 ! CHECK:   %[[VAL_60:.*]] = fir.alloca !fir.char<1> {bindc_name = ".chrtmp"}
 ! CHECK:   fir.store %[[VAL_59]] to %[[VAL_60]] : !fir.ref<!fir.char<1>>
-! CHECK:   %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:   %[[VAL_62:.*]] = fir.emboxchar %[[VAL_61]], %[[VAL_48]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:   %[[VAL_62:.*]] = fir.emboxchar %[[VAL_60]], %[[VAL_48]] : (!fir.ref<!fir.char<1>>, index) -> !fir.boxchar<1>
 ! CHECK:   %[[VAL_63:.*]] = fir.call @_QPelem(%[[VAL_62]], %[[VAL_54]]) {{.*}}: (!fir.boxchar<1>, !fir.ref<i32>) -> i32
 ! CHECK:   %[[VAL_64:.*]] = fir.array_coor %[[VAL_65]](%[[VAL_49]]) %[[VAL_53]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:   fir.store %[[VAL_63]] to %[[VAL_64]] : !fir.ref<i32>
@@ -143,8 +142,7 @@ subroutine foo4(i, j)
 ! CHECK:   %[[VAL_86:.*]] = arith.cmpi sgt, %[[VAL_85]], %[[VAL_70]] : index
 ! CHECK:   cond_br %[[VAL_86]], ^bb2, ^bb3
 ! CHECK: ^bb2:
-! CHECK:   %[[VAL_87:.*]] = fir.convert %[[VAL_80]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:   %[[VAL_88:.*]] = fir.emboxchar %[[VAL_87]], %[[VAL_71]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:   %[[VAL_88:.*]] = fir.emboxchar %[[VAL_80]], %[[VAL_71]] : (!fir.ref<!fir.char<1>>, index) -> !fir.boxchar<1>
 ! CHECK:   %[[VAL_89:.*]] = arith.addi %[[VAL_84]], %[[VAL_71]] : index
 ! CHECK:   %[[VAL_90:.*]] = fir.array_coor %[[VAL_74]](%[[VAL_72]]) %[[VAL_89]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:   %[[VAL_91:.*]] = fir.call @_QPelem(%[[VAL_88]], %[[VAL_90]]) {{.*}}: (!fir.boxchar<1>, !fir.ref<i32>) -> i32
@@ -181,8 +179,7 @@ subroutine foo5(i, j)
 ! CHECK:   %[[VAL_108:.*]] = arith.cmpi sgt, %[[VAL_107]], %[[VAL_98]] : index
 ! CHECK:   cond_br %[[VAL_108]], ^bb2, ^bb3
 ! CHECK: ^bb2:
-! CHECK:   %[[VAL_109:.*]] = fir.convert %[[VAL_102]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:   %[[VAL_110:.*]] = fir.emboxchar %[[VAL_109]], %[[VAL_95]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:   %[[VAL_110:.*]] = fir.emboxchar %[[VAL_102]], %[[VAL_95]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 ! CHECK:   %[[VAL_111:.*]] = arith.addi %[[VAL_106]], %[[VAL_99]] : index
 ! CHECK:   %[[VAL_112:.*]] = fir.array_coor %[[VAL_113]](%[[VAL_100]]) %[[VAL_111]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:   %[[VAL_114:.*]] = fir.call @_QPelem(%[[VAL_110]], %[[VAL_112]]) {{.*}}: (!fir.boxchar<1>, !fir.ref<i32>) -> i32
diff --git a/flang/test/Lower/array-elemental-calls-char.f90 b/flang/test/Lower/array-elemental-calls-char.f90
index 9dea4d224f9400..d2a62899b377b7 100644
--- a/flang/test/Lower/array-elemental-calls-char.f90
+++ b/flang/test/Lower/array-elemental-calls-char.f90
@@ -65,8 +65,7 @@ subroutine foo1b(i, c)
 ! CHECK: ^bb2:
 ! CHECK:   %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_19]] : index
 ! CHECK:   %[[VAL_28:.*]] = fir.array_coor %[[VAL_22]](%[[VAL_23]]) %[[VAL_27]] : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index) -> !fir.ref<!fir.char<1,10>>
-! CHECK:   %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:   %[[VAL_30:.*]] = fir.emboxchar %[[VAL_29]], %[[VAL_17]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:   %[[VAL_30:.*]] = fir.emboxchar %[[VAL_28]], %[[VAL_17]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
 ! CHECK:   %[[VAL_31:.*]] = fir.call @_QPelem(%[[VAL_30]]) {{.*}}: (!fir.boxchar<1>) -> i32
 ! CHECK:   %[[VAL_32:.*]] = fir.array_coor %[[VAL_33]](%[[VAL_23]]) %[[VAL_27]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:   fir.store %[[VAL_31]] to %[[VAL_32]] : !fir.ref<i32>
@@ -121,8 +120,7 @@ subroutine foo2b(i, j, c)
 ! CHECK:           %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_4]] : index
 ! CHECK:           cf.cond_br %[[VAL_11]], ^bb2, ^bb3
 ! CHECK:         ^bb2:
-! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:           %[[VAL_13:.*]] = fir.emboxchar %[[VAL_12]], %[[VAL_3]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_13:.*]] = fir.emboxchar %[[VAL_7]], %[[VAL_3]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
 ! CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_9]], %[[VAL_5]] : index
 ! CHECK:           %[[VAL_15:.*]] = fir.array_coor %[[VAL_1]](%[[VAL_8]]) %[[VAL_14]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_16:.*]] = fir.call @_QPelem2(%[[VAL_13]], %[[VAL_15]]) fastmath<contract> : (!fir.boxchar<1>, !fir.ref<i32>) -> i32
@@ -157,8 +155,7 @@ subroutine foo3(i, j)
 ! CHECK:   %[[VAL_82:.*]] = fir.undefined !fir.char<1>
 ! CHECK:   %[[VAL_83:.*]] = fir.insert_value %[[VAL_82]], %[[VAL_81]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
 ! CHECK:   fir.store %[[VAL_83]] to %[[VAL_72]] : !fir.ref<!fir.char<1>>
-! CHECK:   %[[VAL_84:.*]] = fir.convert %[[VAL_72]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:   %[[VAL_85:.*]] = fir.emboxchar %[[VAL_84]], %[[VAL_71]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:   %[[VAL_85:.*]] = fir.emboxchar %[[VAL_72]], %[[VAL_71]] : (!fir.ref<!fir.char<1>>, index) -> !fir.boxchar<1>
 ! CHECK:   %[[VAL_86:.*]] = fir.call @_QPelem(%[[VAL_85]]) {{.*}}: (!fir.boxchar<1>) -> i32
 ! CHECK:   %[[VAL_87:.*]] = fir.array_coor %[[VAL_88]](%[[VAL_73]]) %[[VAL_77]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:   fir.store %[[VAL_86]] to %[[VAL_87]] : !fir.ref<i32>
@@ -184,8 +181,7 @@ subroutine foo4(i, j)
 ! CHECK:   %[[VAL_98:.*]] = arith.cmpi sgt, %[[VAL_97]], %[[VAL_92]] : index
 ! CHECK:   cond_br %[[VAL_98]], ^bb2, ^bb3
 ! CHECK: ^bb2:
-! CHECK:   %[[VAL_99:.*]] = fir.convert %[[VAL_95]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:   %[[VAL_100:.*]] = fir.emboxchar %[[VAL_99]], %[[VAL_90]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:   %[[VAL_100:.*]] = fir.emboxchar %[[VAL_95]], %[[VAL_90]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 ! CHECK:   %[[VAL_101:.*]] = arith.addi %[[VAL_96]], %[[VAL_93]] : index
 ! CHECK:   %[[VAL_102:.*]] = fir.array_coor %[[VAL_103]](%[[VAL_94]]) %[[VAL_101]] : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
 ! CHECK:   %[[VAL_104:.*]] = fir.call @_QPelem2(%[[VAL_100]], %[[VAL_102]]) {{.*}}: (!fir.boxchar<1>, !fir.ref<i32>) -> i32
diff --git a/flang/test/Lower/array-expression.f90 b/flang/test/Lower/array-expression.f90
index 820ca42e95acf0..424aee449bd34e 100644
--- a/flang/test/Lower/array-expression.f90
+++ b/flang/test/Lower/array-expression.f90
@@ -674,8 +674,8 @@ end subroutine test19a
 ! CHECK:         %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<2,?>>) -> !fir.ref<!fir.array<20x!fir.char<2,8>>>
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 20 : index
 ! CHECK:         %[[VAL_5:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
-! CHECK:         %[[VAL_6:.*]] = arith.constant 10 : index
 ! CHECK:         %[[VAL_7:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref<!fir.char<2,?>>) -> !fir.ref<!fir.array<20x!fir.char<2,10>>>
+! CHECK:         %[[VAL_6:.*]] = arith.constant 10 : index
 ! CHECK:         %[[VAL_8:.*]] = arith.constant 20 : index
 ! CHECK:         %[[VAL_9:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
 ! CHECK:         %[[VAL_10:.*]] = fir.array_load %[[VAL_3]](%[[VAL_9]]) : (!fir.ref<!fir.array<20x!fir.char<2,8>>>, !fir.shape<1>) -> !fir.array<20x!fir.char<2,8>>
@@ -724,15 +724,15 @@ end subroutine test19b
 ! CHECK-LABEL: func @_QPtest19c(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.boxchar<4>{{.*}}, %[[VAL_1:.*]]: !fir.boxchar<4>{{.*}}, %[[VAL_2:.*]]: !fir.ref<i32>{{.*}}) {
 ! CHECK:         %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
-! CHECK:         %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<!fir.array<30x!fir.char<4,10>>>
+! CHECK:         %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 30 : index
 ! CHECK:         %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<!fir.array<30x!fir.char<4,?>>>
 ! CHECK:         %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_9:.*]] = arith.constant 0 : i32
 ! CHECK:         %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : i32
 ! CHECK:         %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : i32
-! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<!fir.array<30x!fir.char<4,?>>>
 ! CHECK:         %[[VAL_13:.*]] = arith.constant 30 : index
 ! CHECK:         %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
 ! CHECK:         %[[VAL_15:.*]] = fir.array_load %[[VAL_12]](%[[VAL_14]]) typeparams %[[VAL_11]] : (!fir.ref<!fir.array<30x!fir.char<4,?>>>, !fir.shape<1>, i32) -> !fir.array<30x!fir.char<4,?>>
@@ -782,18 +782,18 @@ end subroutine test19c
 ! CHECK-LABEL: func @_QPtest19d(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.boxchar<1>{{.*}}, %[[VAL_1:.*]]: !fir.boxchar<1>{{.*}}, %[[VAL_2:.*]]: !fir.ref<i32>{{.*}}, %[[VAL_3:.*]]: !fir.ref<i32>{{.*}}) {
 ! CHECK:         %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:         %[[VAL_9:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<40x!fir.char<1,?>>>
 ! CHECK:         %[[VAL_5:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:         %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : i32
 ! CHECK:         %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : i32
-! CHECK:         %[[VAL_9:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<40x!fir.char<1,?>>>
 ! CHECK:         %[[VAL_10:.*]] = arith.constant 40 : index
 ! CHECK:         %[[VAL_11:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:         %[[VAL_16:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<40x!fir.char<1,?>>>
 ! CHECK:         %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_13:.*]] = arith.constant 0 : i32
 ! CHECK:         %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_13]] : i32
 ! CHECK:         %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : i32
-! CHECK:         %[[VAL_16:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<40x!fir.char<1,?>>>
 ! CHECK:         %[[VAL_17:.*]] = arith.constant 40 : index
 ! CHECK:         %[[VAL_18:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
 ! CHECK:         %[[VAL_19:.*]] = fir.array_load %[[VAL_9]](%[[VAL_18]]) typeparams %[[VAL_8]] : (!fir.ref<!fir.array<40x!fir.char<1,?>>>, !fir.shape<1>, i32) -> !fir.array<40x!fir.char<1,?>>
@@ -968,15 +968,15 @@ end subroutine test19f
 ! CHECK-LABEL: func @_QPtest19g(
 ! CHECK-SAME:            %[[VAL_0:.*]]: !fir.boxchar<4>{{.*}}, %[[VAL_1:.*]]: !fir.boxchar<2>{{.*}}, %[[VAL_2:.*]]: !fir.ref<i32>{{.*}}) {
 ! CHECK:         %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
-! CHECK:         %[[VAL_4:.*]] = arith.constant 13 : index
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<2,?>>) -> !fir.ref<!fir.array<140x!fir.char<2,13>>>
+! CHECK:         %[[VAL_4:.*]] = arith.constant 13 : index
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 140 : index
 ! CHECK:         %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<!fir.array<70x!fir.char<4,?>>>
 ! CHECK:         %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_9:.*]] = arith.constant 0 : i32
 ! CHECK:         %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : i32
 ! CHECK:         %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : i32
-! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<!fir.array<70x!fir.char<4,?>>>
 ! CHECK:         %[[VAL_13:.*]] = arith.constant 70 : index
 ! CHECK:         %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
 ! CHECK:         %[[VAL_15:.*]] = fir.array_load %[[VAL_12]](%[[VAL_14]]) typeparams %[[VAL_11]] : (!fir.ref<!fir.array<70x!fir.char<4,?>>>, !fir.shape<1>, i32) -> !fir.array<70x!fir.char<4,?>>
@@ -1038,11 +1038,11 @@ end subroutine test19g
 ! CHECK-LABEL: func @_QPtest19h(
 ! CHECK-SAME:       %[[VAL_0:.*]]: !fir.boxchar<1>{{.*}}, %[[VAL_1:.*]]: !fir.boxchar<1>{{.*}}, %[[VAL_2:.*]]: !fir.ref<i32>{{.*}}, %[[VAL_3:.*]]: !fir.ref<i32>{{.*}}) {
 ! CHECK:         %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:         %[[VAL_9:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<70x!fir.char<1,?>>>
 ! CHECK:         %[[VAL_5:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:         %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : i32
 ! CHECK:         %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : i32
-! CHECK:         %[[VAL_9:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<70x!fir.char<1,?>>>
 ! CHECK:         %[[VAL_10:.*]] = arith.constant 70 : index
 ! CHECK:         %[[VAL_11:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
diff --git a/flang/test/Lower/array.f90 b/flang/test/Lower/array.f90
index e6e12862858385..3371887d041279 100644
--- a/flang/test/Lower/array.f90
+++ b/flang/test/Lower/array.f90
@@ -4,7 +4,7 @@
 ! CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32
 ! CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2.400000e+00 : f32
 ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[VAL_4:.*]] = fir.undefined tuple<!fir.array<5x5xf32>>
+! CHECK: %[[VAL_4:.*]] = fir.zero_bits tuple<!fir.array<5x5xf32>>
 ! CHECK: %[[VAL_5:.*]] = fir.undefined !fir.array<5x5xf32>
 ! CHECK: %[[VAL_6:.*]] = fir.insert_on_range %[[VAL_5]], %[[VAL_1]] from (0, 0) to (1, 0) : (!fir.array<5x5xf32>, f32) -> !fir.array<5x5xf32>
 ! CHECK: %[[VAL_7:.*]] = fir.insert_on_range %[[VAL_6]], %[[VAL_3]] from (2, 0) to (4, 0) : (!fir.array<5x5xf32>, f32) -> !fir.array<5x5xf32>
diff --git a/flang/test/Lower/call-by-value-attr.f90 b/flang/test/Lower/call-by-value-attr.f90
index 33060e0e9401fc..dddc8e766fd289 100644
--- a/flang/test/Lower/call-by-value-attr.f90
+++ b/flang/test/Lower/call-by-value-attr.f90
@@ -137,8 +137,7 @@ subroutine takes_char_value(v)
   ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.char<1,71>>) -> !fir.ref<i8>
   ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,71>>) -> !fir.ref<i8>
   ! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_7]], %[[VAL_8]], %[[VAL_5]], %[[VAL_6]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-  ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.char<1,71>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: %[[VAL_10:.*]] = fir.emboxchar %[[VAL_9]], %[[VAL_1]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: %[[VAL_10:.*]] = fir.emboxchar %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.char<1,71>>, index) -> !fir.boxchar<1>
   ! CHECK: fir.call @_QPtakes_char_value(%[[VAL_10]]) {{.*}}: (!fir.boxchar<1>) -> ()
   call takes_char_value("a character string litteral that could be locally modfied by the callee")
 end subroutine
diff --git a/flang/test/Lower/call-by-value.f90 b/flang/test/Lower/call-by-value.f90
index de11a27d86c34b..b7fc8fbac496ae 100644
--- a/flang/test/Lower/call-by-value.f90
+++ b/flang/test/Lower/call-by-value.f90
@@ -65,10 +65,9 @@ subroutine test_complex_value(x) bind(c)
 ! CHECK-LABEL:   func.func @test_char_value(
 ! CHECK-SAME:                               %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "x"}) attributes {fir.bindc_name = "test_char_value"} {
 ! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1>>
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:           %[[VAL_5:.*]] = fir.emboxchar %[[VAL_4]], %[[VAL_2]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_5:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_2]] : (!fir.ref<!fir.char<1>>, index) -> !fir.boxchar<1>
 ! CHECK:           fir.call @_QPinternal_call4(%[[VAL_5]]) fastmath<contract> : (!fir.boxchar<1>) -> ()
 ! CHECK:           return
 ! CHECK:         }
diff --git a/flang/test/Lower/call-implicit.f90 b/flang/test/Lower/call-implicit.f90
index e59736a87d4fe2..7c9356fee644aa 100644
--- a/flang/test/Lower/call-implicit.f90
+++ b/flang/test/Lower/call-implicit.f90
@@ -7,8 +7,7 @@ subroutine s2
   ! CHECK: fir.call @_QPsub2(%[[a0]]) {{.*}}: (!fir.ref<!fir.array<3xi32>>) -> ()
   call sub2(i)
 ! CHECK:  %[[a1:.*]] = fir.address_of(@_QQcl.3031323334) : !fir.ref<!fir.char<1,5>>
-! CHECK:  %[[a2:.*]] = fir.convert %[[a1]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:  %[[a3:.*]] = fir.convert %[[a2]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3xi32>>
-  ! CHECK: fir.call @_QPsub2(%[[a3]]) {{.*}}: (!fir.ref<!fir.array<3xi32>>) -> ()
+! CHECK:  %[[a2:.*]] = fir.convert %[[a1]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.array<3xi32>>
+  ! CHECK: fir.call @_QPsub2(%[[a2]]) {{.*}}: (!fir.ref<!fir.array<3xi32>>) -> ()
   call sub2("01234")
 end
diff --git a/flang/test/Lower/call-parenthesized-arg.f90 b/flang/test/Lower/call-parenthesized-arg.f90
index de5b4d7658dfc6..8f164fed51a39f 100644
--- a/flang/test/Lower/call-parenthesized-arg.f90
+++ b/flang/test/Lower/call-parenthesized-arg.f90
@@ -23,10 +23,9 @@ subroutine foo_num_scalar(x)
 subroutine foo_char_scalar(x)
   character(5) :: x
 ! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:           %[[VAL_2:.*]] = arith.constant 5 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,5>>
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:           %[[VAL_5:.*]] = fir.emboxchar %[[VAL_4]], %[[VAL_2]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_2:.*]] = arith.constant 5 : index
+! CHECK:           %[[VAL_5:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_2]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 ! CHECK:           fir.call @_QPbar_char_scalar(%[[VAL_5]]) fastmath<contract> : (!fir.boxchar<1>) -> ()
   call bar_char_scalar(x)
 ! CHECK:           %[[VAL_6:.*]] = fir.no_reassoc %[[VAL_3]] : !fir.ref<!fir.char<1,5>>
@@ -38,8 +37,7 @@ subroutine foo_char_scalar(x)
 ! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
 ! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
 ! CHECK:           fir.call @llvm.memmove.p0.p0.i64(%[[VAL_12]], %[[VAL_13]], %[[VAL_10]], %[[VAL_11]]) fastmath<contract> : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-! CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:           %[[VAL_15:.*]] = fir.emboxchar %[[VAL_14]], %[[VAL_2]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_15:.*]] = fir.emboxchar %[[VAL_7]], %[[VAL_2]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
 ! CHECK:           fir.call @_QPbar_char_scalar(%[[VAL_15]]) fastmath<contract> : (!fir.boxchar<1>) -> ()
 ! CHECK:           return
 ! CHECK:         }
@@ -81,8 +79,8 @@ subroutine foo_num_array(x)
 ! CHECK-SAME:                          %[[VAL_0:.*]]: !fir.boxchar<1>{{.*}}) {
 subroutine foo_char_array(x)
   ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-  ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
   ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,10>>>
+  ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
   ! CHECK: %[[VAL_4:.*]] = arith.constant 100 : index
   ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<100x!fir.char<1,10>>>) -> !fir.ref<!fir.char<1,?>>
   ! CHECK: %[[VAL_6:.*]] = fir.emboxchar %[[VAL_5]], %[[VAL_2]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
diff --git a/flang/test/Lower/common-block-2.f90 b/flang/test/Lower/common-block-2.f90
index 31916f4be9fcb2..4b12860a48ffae 100644
--- a/flang/test/Lower/common-block-2.f90
+++ b/flang/test/Lower/common-block-2.f90
@@ -6,12 +6,12 @@
 ! - A common block that is initialized outside of a BLOCK DATA.
 
 ! CHECK-LABEL: fir.global @__BLNK__ : tuple<i32, !fir.array<8xi8>> {
-! CHECK:  %[[undef:.*]] = fir.undefined tuple<i32, !fir.array<8xi8>>
+! CHECK:  %[[undef:.*]] = fir.zero_bits tuple<i32, !fir.array<8xi8>>
 ! CHECK:  %[[init:.*]] = fir.insert_value %[[undef]], %c42{{.*}}, [0 : index] : (tuple<i32, !fir.array<8xi8>>, i32) -> tuple<i32, !fir.array<8xi8>>
 ! CHECK:  fir.has_value %[[init]] : tuple<i32, !fir.array<8xi8>>
 
 ! CHECK-LABEL: fir.global @a_ : tuple<i32, !fir.array<8xi8>> {
-! CHECK:  %[[undef:.*]] = fir.undefined tuple<i32, !fir.array<8xi8>>
+! CHECK:  %[[undef:.*]] = fir.zero_bits tuple<i32, !fir.array<8xi8>>
 ! CHECK:  %[[init:.*]] = fir.insert_value %[[undef]], %c42{{.*}}, [0 : index] : (tuple<i32, !fir.array<8xi8>>, i32) -> tuple<i32, !fir.array<8xi8>>
 ! CHECK:  fir.has_value %[[init]] : tuple<i32, !fir.array<8xi8>>
 
diff --git a/flang/test/Lower/common-block.f90 b/flang/test/Lower/common-block.f90
index bd3fab507c0ef4..3934b71b75694e 100644
--- a/flang/test/Lower/common-block.f90
+++ b/flang/test/Lower/common-block.f90
@@ -6,7 +6,7 @@
 ! CHECK: @with_empty_equiv_ = common global [8 x i8] zeroinitializer
 ! CHECK: @x_ = global { float, float } { float 1.0{{.*}}, float 2.0{{.*}} }
 ! CHECK: @y_ = common global [12 x i8] zeroinitializer
-! CHECK: @z_ = global { i32, [4 x i8], float } { i32 42, [4 x i8] undef, float 3.000000e+00 }
+! CHECK: @z_ = global { i32, [4 x i8], float } { i32 42, [4 x i8] zeroinitializer, float 3.000000e+00 }
 
 ! CHECK-LABEL: _QPs0
 subroutine s0
diff --git a/flang/test/Lower/default-initialization-globals.f90 b/flang/test/Lower/default-initialization-globals.f90
index bbcea75a5b1f97..2b5e3c36367e2e 100644
--- a/flang/test/Lower/default-initialization-globals.f90
+++ b/flang/test/Lower/default-initialization-globals.f90
@@ -97,7 +97,7 @@ module tinit
   ! CHECK-DAG: %[[VAL_15:.*]] = arith.constant 66 : i32
   ! CHECK: %[[VAL_16:.*]] = fir.undefined !fir.type<_QMtinitTt1{{.*}}>
   ! CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_11]], ["i", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, i32) -> !fir.type<_QMtinitTt1{{.*}}>
-  ! CHECK: %[[VAL_18:.*]] = fir.undefined i32
+  ! CHECK: %[[VAL_18:.*]] = fir.zero_bits i32
   ! CHECK: %[[VAL_19:.*]] = fir.insert_value %[[VAL_17]], %[[VAL_18]], ["j", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, i32) -> !fir.type<_QMtinitTt1{{.*}}>
   ! CHECK: %[[VAL_20:.*]] = fir.address_of(@_QMtinitEziel) : !fir.ref<!fir.array<100xf32>>
   ! CHECK: %[[VAL_21:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
@@ -111,12 +111,12 @@ module tinit
   ! CHECK: %[[VAL_28:.*]] = fir.insert_value %[[VAL_27]], %[[VAL_26]], ["z", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, !fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.type<_QMtinitTt1{{.*}}>
   ! CHECK: %[[VAL_29:.*]] = fir.string_lit "hello     "(10) : !fir.char<1,10>
   ! CHECK: %[[VAL_30:.*]] = fir.insert_value %[[VAL_28]], %[[VAL_29]], ["c1", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, !fir.char<1,10>) -> !fir.type<_QMtinitTt1{{.*}}>
-  ! CHECK: %[[VAL_31:.*]] = fir.undefined !fir.char<1,10>
+  ! CHECK: %[[VAL_31:.*]] = fir.zero_bits !fir.char<1,10>
   ! CHECK: %[[VAL_32:.*]] = fir.insert_value %[[VAL_30]], %[[VAL_31]], ["c2", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, !fir.char<1,10>) -> !fir.type<_QMtinitTt1{{.*}}>
   ! CHECK: %[[VAL_33:.*]] = fir.undefined !fir.type<_QMtinitTt0{k:i32}>
   ! CHECK: %[[VAL_34:.*]] = fir.insert_value %[[VAL_33]], %[[VAL_15]], ["k", !fir.type<_QMtinitTt0{k:i32}>] : (!fir.type<_QMtinitTt0{k:i32}>, i32) -> !fir.type<_QMtinitTt0{k:i32}>
   ! CHECK: %[[VAL_35:.*]] = fir.insert_value %[[VAL_32]], %[[VAL_34]], ["somet0", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, !fir.type<_QMtinitTt0{k:i32}>) -> !fir.type<_QMtinitTt1{{.*}}>
-  ! CHECK: %[[VAL_36:.*]] = fir.undefined !fir.type<_QMtinitTtno_init{k:i32}>
+  ! CHECK: %[[VAL_36:.*]] = fir.zero_bits !fir.type<_QMtinitTtno_init{k:i32}>
   ! CHECK: %[[VAL_37:.*]] = fir.insert_value %[[VAL_35]], %[[VAL_36]], ["sometno_init", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, !fir.type<_QMtinitTtno_init{k:i32}>) -> !fir.type<_QMtinitTt1{{.*}}>
   ! CHECK: %[[VAL_38:.*]] = fir.insert_value %[[VAL_33]], %[[VAL_14]], ["k", !fir.type<_QMtinitTt0{k:i32}>] : (!fir.type<_QMtinitTt0{k:i32}>, i32) -> !fir.type<_QMtinitTt0{k:i32}>
   ! CHECK: %[[VAL_39:.*]] = fir.insert_value %[[VAL_37]], %[[VAL_38]], ["somet0_2", !fir.type<_QMtinitTt1{{.*}}>] : (!fir.type<_QMtinitTt1{{.*}}>, !fir.type<_QMtinitTt0{k:i32}>) -> !fir.type<_QMtinitTt1{{.*}}>
@@ -129,7 +129,7 @@ module tinit
   ! CHECK: %[[VAL_42:.*]] = arith.constant 66 : i32
   ! CHECK: %[[VAL_43:.*]] = fir.undefined !fir.type<_QMtinitTtextendst0{k:i32,l:i32}>
   ! CHECK: %[[VAL_44:.*]] = fir.insert_value %[[VAL_43]], %[[VAL_42]], ["k", !fir.type<_QMtinitTtextendst0{k:i32,l:i32}>] : (!fir.type<_QMtinitTtextendst0{k:i32,l:i32}>, i32) -> !fir.type<_QMtinitTtextendst0{k:i32,l:i32}>
-  ! CHECK: %[[VAL_45:.*]] = fir.undefined i32
+  ! CHECK: %[[VAL_45:.*]] = fir.zero_bits i32
   ! CHECK: %[[VAL_46:.*]] = fir.insert_value %[[VAL_44]], %[[VAL_45]], ["l", !fir.type<_QMtinitTtextendst0{k:i32,l:i32}>] : (!fir.type<_QMtinitTtextendst0{k:i32,l:i32}>, i32) -> !fir.type<_QMtinitTtextendst0{k:i32,l:i32}>
   ! CHECK: fir.has_value %[[VAL_46]] : !fir.type<_QMtinitTtextendst0{k:i32,l:i32}>
 
diff --git a/flang/test/Lower/derived-allocatable-components.f90 b/flang/test/Lower/derived-allocatable-components.f90
index ced1712543f9c3..ded7b85186a3fd 100644
--- a/flang/test/Lower/derived-allocatable-components.f90
+++ b/flang/test/Lower/derived-allocatable-components.f90
@@ -179,8 +179,7 @@ subroutine ref_scalar_cst_char_a(a0_0, a1_0, a0_1, a1_1)
   ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[a0_0]], %[[fld]]
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(a0_0%p)
 
@@ -189,8 +188,7 @@ subroutine ref_scalar_cst_char_a(a0_0, a1_0, a0_1, a1_1)
   ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[coor0]], %[[fld]]
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(a0_1(5)%p)
 
@@ -203,8 +201,7 @@ subroutine ref_scalar_cst_char_a(a0_0, a1_0, a0_1, a1_1)
   ! CHECK: %[[lb:.*]] = fir.convert %[[dims]]#0 : (index) -> i64
   ! CHECK: %[[index:.*]] = arith.subi %c7{{.*}}, %[[lb]] : i64
   ! CHECK: %[[addr:.*]] = fir.coordinate_of %[[base]], %[[index]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(a1_0%p(7))
 
@@ -218,8 +215,7 @@ subroutine ref_scalar_cst_char_a(a0_0, a1_0, a0_1, a1_1)
   ! CHECK: %[[lb:.*]] = fir.convert %[[dims]]#0 : (index) -> i64
   ! CHECK: %[[index:.*]] = arith.subi %c7{{.*}}, %[[lb]] : i64
   ! CHECK: %[[addr:.*]] = fir.coordinate_of %[[base]], %[[index]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(a1_1(5)%p(7))
 
@@ -236,8 +232,7 @@ subroutine ref_scalar_def_char_a(a0_0, a1_0, a0_1, a1_1)
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK-DAG: %[[len:.*]] = fir.box_elesize %[[box]]
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK-DAG: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %[[len]]
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len]]
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(a0_0%p)
 
@@ -247,8 +242,7 @@ subroutine ref_scalar_def_char_a(a0_0, a1_0, a0_1, a1_1)
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK-DAG: %[[len:.*]] = fir.box_elesize %[[box]]
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK-DAG: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %[[len]]
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len]]
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(a0_1(5)%p)
 
diff --git a/flang/test/Lower/derived-pointer-components.f90 b/flang/test/Lower/derived-pointer-components.f90
index d01f6bb91d42b0..a989a7c8a9bce4 100644
--- a/flang/test/Lower/derived-pointer-components.f90
+++ b/flang/test/Lower/derived-pointer-components.f90
@@ -212,8 +212,7 @@ subroutine ref_scalar_cst_char_p(p0_0, p1_0, p0_1, p1_1)
   ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[p0_0]], %[[fld]]
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(p0_0%p)
 
@@ -222,8 +221,7 @@ subroutine ref_scalar_cst_char_p(p0_0, p1_0, p0_1, p1_1)
   ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[coor0]], %[[fld]]
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(p0_1(5)%p)
 
@@ -235,8 +233,7 @@ subroutine ref_scalar_cst_char_p(p0_0, p1_0, p0_1, p1_1)
   ! CHECK: %[[lb:.*]] = fir.convert %[[dims]]#0 : (index) -> i64
   ! CHECK: %[[index:.*]] = arith.subi %c7{{.*}}, %[[lb]]
   ! CHECK: %[[addr:.*]] = fir.coordinate_of %[[box]], %[[index]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(p1_0%p(7))
 
@@ -249,8 +246,7 @@ subroutine ref_scalar_cst_char_p(p0_0, p1_0, p0_1, p1_1)
   ! CHECK: %[[lb:.*]] = fir.convert %[[dims]]#0 : (index) -> i64
   ! CHECK: %[[index:.*]] = arith.subi %c7{{.*}}, %[[lb]]
   ! CHECK: %[[addr:.*]] = fir.coordinate_of %[[box]], %[[index]]
-  ! CHECK: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %c10{{.*}}
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %c10{{.*}}
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(p1_1(5)%p(7))
 
@@ -267,8 +263,7 @@ subroutine ref_scalar_def_char_p(p0_0, p1_0, p0_1, p1_1)
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK-DAG: %[[len:.*]] = fir.box_elesize %[[box]]
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK-DAG: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %[[len]]
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len]]
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(p0_0%p)
 
@@ -278,8 +273,7 @@ subroutine ref_scalar_def_char_p(p0_0, p1_0, p0_1, p1_1)
   ! CHECK: %[[box:.*]] = fir.load %[[coor]]
   ! CHECK-DAG: %[[len:.*]] = fir.box_elesize %[[box]]
   ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]]
-  ! CHECK-DAG: %[[cast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[cast]], %[[len]]
+  ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[addr]], %[[len]]
   ! CHECK: fir.call @_QPtakes_char_scalar(%[[boxchar]])
   call takes_char_scalar(p0_1(5)%p)
 
diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90
index f68277f5044eb8..53e83bfdd27bb2 100644
--- a/flang/test/Lower/derived-types.f90
+++ b/flang/test/Lower/derived-types.f90
@@ -67,8 +67,7 @@ subroutine scalar_character_ref()
   ! CHECK: %[[field:.*]] = fir.field_index ch, !fir.type<_QMdTc{ch:!fir.char<1,10>}>
   ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[alloc]], %[[field]] : (!fir.ref<!fir.type<_QMdTc{ch:!fir.char<1,10>}>>, !fir.field) -> !fir.ref<!fir.char<1,10>>
   ! CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index
-  ! CHECK-DAG: %[[conv:.*]] = fir.convert %[[coor]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
-  ! CHECK: fir.emboxchar %[[conv]], %c10 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: fir.emboxchar %[[coor]], %c10 : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
   call char_bar(some_c%ch)
 end subroutine
 
@@ -95,7 +94,7 @@ subroutine char_array_comp_elt_ref()
   ! CHECK-DAG: %[[index1:.*]] = arith.subi %c5{{.*}}, %c1{{.*}} : i64
   ! CHECK-DAG: %[[index2:.*]] = arith.subi %c6{{.*}}, %c1{{.*}} : i64
   ! CHECK: fir.coordinate_of %[[coor]], %[[index1]], %[[index2]] : (!fir.ref<!fir.array<20x30x!fir.char<1,10>>>, i64, i64) -> !fir.ref<!fir.char<1,10>>
-  ! CHECK: fir.emboxchar %{{.*}}, %c10 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: fir.emboxchar %{{.*}}, %c10 : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
   call char_bar(some_c2%ch_array(5, 6))
 end subroutine
 
@@ -115,7 +114,7 @@ subroutine char_array_elt_comp_ref()
   type(c) :: some_c_array(100)
   ! CHECK: fir.coordinate_of %{{.*}}, %{{.*}} : (!fir.ref<!fir.array<100x!fir.type<_QMdTc{ch:!fir.char<1,10>}>>>, i64) -> !fir.ref<!fir.type<_QMdTc{ch:!fir.char<1,10>}>>
   ! CHECK: fir.coordinate_of %{{.*}}, %{{.*}} : (!fir.ref<!fir.type<_QMdTc{ch:!fir.char<1,10>}>>, !fir.field) -> !fir.ref<!fir.char<1,10>>
-  ! CHECK: fir.emboxchar %{{.*}}, %c10{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK: fir.emboxchar %{{.*}}, %c10{{.*}} : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
   call char_bar(some_c_array(5)%ch)
 end subroutine
 
diff --git a/flang/test/Lower/dispatch-table.f90 b/flang/test/Lower/dispatch-table.f90
index 7b5a5ec4a39e62..0df4981b3eaf86 100644
--- a/flang/test/Lower/dispatch-table.f90
+++ b/flang/test/Lower/dispatch-table.f90
@@ -1,6 +1,6 @@
 ! RUN: bbc -polymorphic-type -emit-fir %s -o - | FileCheck %s
 
-! Tests the generation of fir.dispatch_table operations.
+! Tests the generation of fir.type_info operations.
 
 module polymorphic_types
   type p1
@@ -53,20 +53,20 @@ subroutine aproc3(p)
 
 end module
 
-! CHECK-LABEL: fir.dispatch_table @_QMpolymorphic_typesTp1 {
+! CHECK-LABEL: fir.type_info @_QMpolymorphic_typesTp1
 ! CHECK:         fir.dt_entry "aproc", @_QMpolymorphic_typesPaproc
 ! CHECK:         fir.dt_entry "proc1", @_QMpolymorphic_typesPproc1_p1
 ! CHECK:         fir.dt_entry "zproc", @_QMpolymorphic_typesPzproc
 ! CHECK:       }
 
-! CHECK-LABEL: fir.dispatch_table @_QMpolymorphic_typesTp2 extends("_QMpolymorphic_typesTp1") {
+! CHECK-LABEL: fir.type_info @_QMpolymorphic_typesTp2 {{.*}}extends !fir.type<_QMpolymorphic_typesTp1{{.*}}>
 ! CHECK:         fir.dt_entry "aproc", @_QMpolymorphic_typesPaproc
 ! CHECK:         fir.dt_entry "proc1", @_QMpolymorphic_typesPproc1_p2
 ! CHECK:         fir.dt_entry "zproc", @_QMpolymorphic_typesPzproc
 ! CHECK:         fir.dt_entry "aproc2", @_QMpolymorphic_typesPaproc2
 ! CHECK:       }
 
-! CHECK-LABEL: fir.dispatch_table @_QMpolymorphic_typesTp3 extends("_QMpolymorphic_typesTp2") {
+! CHECK-LABEL: fir.type_info @_QMpolymorphic_typesTp3 {{.*}}extends !fir.type<_QMpolymorphic_typesTp2{{.*}}>
 ! CHECK:         fir.dt_entry "aproc", @_QMpolymorphic_typesPaproc
 ! CHECK:         fir.dt_entry "proc1", @_QMpolymorphic_typesPproc1_p2
 ! CHECK:         fir.dt_entry "zproc", @_QMpolymorphic_typesPzproc
diff --git a/flang/test/Lower/dummy-argument-optional-2.f90 b/flang/test/Lower/dummy-argument-optional-2.f90
index 82b7e612c924e7..e49b709fa9a95f 100644
--- a/flang/test/Lower/dummy-argument-optional-2.f90
+++ b/flang/test/Lower/dummy-argument-optional-2.f90
@@ -64,8 +64,7 @@ subroutine pass_pointer_scalar_char(c)
 ! CHECK:         %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
 ! CHECK:         %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
 ! CHECK:         %[[VAL_3:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
-! CHECK:         %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:         %[[VAL_5:.*]] = fir.emboxchar %[[VAL_4]], %[[VAL_2]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:         %[[VAL_5:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_2]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 ! CHECK:         fir.call @_QPtakes_opt_scalar_char(%[[VAL_5]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
@@ -77,8 +76,7 @@ subroutine pass_allocatable_scalar_char(c)
 ! CHECK:         %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK:         %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
 ! CHECK:         %[[VAL_3:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
-! CHECK:         %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.heap<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:         %[[VAL_5:.*]] = fir.emboxchar %[[VAL_4]], %[[VAL_2]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:         %[[VAL_5:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_2]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 ! CHECK:         fir.call @_QPtakes_opt_scalar_char(%[[VAL_5]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
diff --git a/flang/test/Lower/dummy-argument-optional.f90 b/flang/test/Lower/dummy-argument-optional.f90
index 351a1977abd67f..d4f19398403373 100644
--- a/flang/test/Lower/dummy-argument-optional.f90
+++ b/flang/test/Lower/dummy-argument-optional.f90
@@ -61,8 +61,7 @@ subroutine character_scalar(x)
 subroutine call_character_scalar()
   ! CHECK: %[[addr:.*]] = fir.alloca !fir.char<1,10>
   character(10) :: x
-  ! CHECK: %[[addrCast:.*]] = fir.convert %[[addr]]
-  ! CHECK: %[[x:.*]] = fir.emboxchar %[[addrCast]], {{.*}}
+  ! CHECK: %[[x:.*]] = fir.emboxchar %[[addr]], {{.*}}
   ! CHECK: fir.call @_QMoptPcharacter_scalar(%[[x]]) {{.*}}: (!fir.boxchar<1>) -> ()
   call character_scalar(x)
   ! CHECK: %[[absent:.*]] = fir.absent !fir.boxchar<1>
diff --git a/flang/test/Lower/entry-statement.f90 b/flang/test/Lower/entry-statement.f90
index 6a857c854ff19b..3f1fc1ddabcc78 100644
--- a/flang/test/Lower/entry-statement.f90
+++ b/flang/test/Lower/entry-statement.f90
@@ -249,36 +249,26 @@ subroutine level3a(a, b, m)
 end
 
 ! CHECK-LABEL: @_QPf1
+! CHECK-SAME: %[[V_0:.*]]: !fir.ref<!fir.char<1,5>>
 function f1(n1) result(res1)
-  ! CHECK:   %[[V_0:[0-9]+]] = fir.convert %arg0 : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
   ! CHECK:   %[[V_1:[0-9]+]] = fir.alloca i32 {bindc_name = "n2", uniq_name = "_QFf1En2"}
   ! CHECK:   %[[V_2:[0-9]+]] = fir.alloca tuple<!fir.boxchar<1>, !fir.boxchar<1>>
   ! CHECK:   %[[V_3:[0-9]+]] = fir.coordinate_of %[[V_2]], %c0{{.*}}_i32 : (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-  ! CHECK:   %[[V_4:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_4:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   fir.store %[[V_4]] to %[[V_3]] : !fir.ref<!fir.boxchar<1>>
   ! CHECK:   %[[V_5:[0-9]+]] = fir.coordinate_of %[[V_2]], %c1{{.*}}_i32 : (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-  ! CHECK:   %[[V_6:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_6:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   fir.store %[[V_6]] to %[[V_5]] : !fir.ref<!fir.boxchar<1>>
   ! CHECK:   br ^bb1
   ! CHECK: ^bb1:  // pred: ^bb0
   ! CHECK:   %[[V_7:[0-9]+]] = fir.address_of(@_QQcl.6120612061) : !fir.ref<!fir.char<1,5>>
-  ! CHECK:   %[[V_8:[0-9]+]] = arith.cmpi slt, %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_9:[0-9]+]] = arith.select %[[V_8]], %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_10:[0-9]+]] = fir.convert %[[V_9]] : (index) -> i64
+  ! CHECK:   %[[V_10:[0-9]+]] = fir.convert %c5{{.*}} : (index) -> i64
   ! CHECK:   %[[V_11:[0-9]+]] = arith.muli %c1{{.*}}_i64, %[[V_10]] : i64
-  ! CHECK:   %[[V_12:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+  ! CHECK:   %[[V_12:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   %[[V_13:[0-9]+]] = fir.convert %[[V_7]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   fir.call @llvm.memmove.p0.p0.i64(%[[V_12]], %[[V_13]], %[[V_11]], %false{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-  ! CHECK:   %[[V_14:[0-9]+]] = arith.subi %c5{{.*}}, %c1{{.*}} : index
-  ! CHECK:   %[[V_15:[0-9]+]] = fir.undefined !fir.char<1>
-  ! CHECK:   %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c32{{.*}}_i8, [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
-  ! CHECK:   fir.do_loop %arg3 = %[[V_9]] to %[[V_14]] step %c1{{.*}} {
-  ! CHECK:     %[[V_32:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-  ! CHECK:     %[[V_33:[0-9]+]] = fir.coordinate_of %[[V_32]], %arg3 : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-  ! CHECK:     fir.store %[[V_16]] to %[[V_33]] : !fir.ref<!fir.char<1>>
-  ! CHECK:   }
   ! CHECK:   %[[V_17:[0-9]+]] = fir.load %arg2 : !fir.ref<i32>
-  ! CHECK:   %[[V_18:[0-9]+]] = arith.cmpi eq, %[[V_17]], %c1{{.*}}_i32_4 : i32
+  ! CHECK:   %[[V_18:[0-9]+]] = arith.cmpi eq, %[[V_17]], %c1{{.*}} : i32
   ! CHECK:   cond_br %[[V_18]], ^bb2, ^bb3
   ! CHECK: ^bb2:  // pred: ^bb1
   ! CHECK:   br ^bb6
@@ -291,25 +281,15 @@ function f1(n1) result(res1)
   ! CHECK:   cf.br ^bb6
   ! CHECK: ^bb5:  // pred: ^bb3
   ! CHECK:   %[[V_21:[0-9]+]] = fir.address_of(@_QQcl.4320432043) : !fir.ref<!fir.char<1,5>>
-  ! CHECK:   %[[V_22:[0-9]+]] = arith.cmpi slt, %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_23:[0-9]+]] = arith.select %[[V_22]], %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (index) -> i64
-  ! CHECK:   %[[V_25:[0-9]+]] = arith.muli %c1{{.*}}_i64_6, %[[V_24]] : i64
-  ! CHECK:   %[[V_26:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+  ! CHECK:   %[[V_24:[0-9]+]] = fir.convert %c5{{.*}} : (index) -> i64
+  ! CHECK:   %[[V_25:[0-9]+]] = arith.muli %c1{{.*}}, %[[V_24]] : i64
+  ! CHECK:   %[[V_26:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   %[[V_27:[0-9]+]] = fir.convert %[[V_21]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   fir.call @llvm.memmove.p0.p0.i64(%[[V_26]], %[[V_27]], %[[V_25]], %false{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-  ! CHECK:   %[[V_28:[0-9]+]] = arith.subi %c5{{.*}}, %c1{{.*}} : index
-  ! CHECK:   %[[V_29:[0-9]+]] = fir.undefined !fir.char<1>
-  ! CHECK:   %[[V_30:[0-9]+]] = fir.insert_value %[[V_29]], %c32{{.*}}_i8_9, [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
-  ! CHECK:   fir.do_loop %arg3 = %[[V_23]] to %[[V_28]] step %c1{{.*}} {
-  ! CHECK:     %[[V_32]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-  ! CHECK:     %[[V_33]] = fir.coordinate_of %[[V_32]], %arg3 : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-  ! CHECK:     fir.store %[[V_30]] to %[[V_33]] : !fir.ref<!fir.char<1>>
-  ! CHECK:   }
   ! CHECK:   fir.call @_QFf1Ps3(%[[V_2]]) {{.*}}: (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>) -> ()
   ! CHECK:   br ^bb6
   ! CHECK: ^bb6:  //  3 preds: ^bb2, ^bb4, ^bb5
-  ! CHECK:   %[[V_31:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_31:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   return %[[V_31]] : !fir.boxchar<1>
   ! CHECK: }
   character(5) res1, f2, f3
@@ -317,15 +297,15 @@ function f1(n1) result(res1)
   if (n1 == 1) return
 
 ! CHECK-LABEL: @_QPf2
+! CHECK-SAME: %[[V_0:.*]]: !fir.ref<!fir.char<1,5>>
 entry f2(n2)
-  ! CHECK:   %[[V_0:[0-9]+]] = fir.convert %arg0 : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
   ! CHECK:   %[[V_1:[0-9]+]] = fir.alloca i32 {bindc_name = "n1", uniq_name = "_QFf1En1"}
   ! CHECK:   %[[V_2:[0-9]+]] = fir.alloca tuple<!fir.boxchar<1>, !fir.boxchar<1>>
   ! CHECK:   %[[V_3:[0-9]+]] = fir.coordinate_of %[[V_2]], %c0{{.*}}_i32 : (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-  ! CHECK:   %[[V_4:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_4:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   fir.store %[[V_4]] to %[[V_3]] : !fir.ref<!fir.boxchar<1>>
   ! CHECK:   %[[V_5:[0-9]+]] = fir.coordinate_of %[[V_2]], %c1{{.*}}_i32 : (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-  ! CHECK:   %[[V_6:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_6:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   fir.store %[[V_6]] to %[[V_5]] : !fir.ref<!fir.boxchar<1>>
   ! CHECK:   br ^bb1
   ! CHECK: ^bb1:  // pred: ^bb0
@@ -337,64 +317,44 @@ function f1(n1) result(res1)
   ! CHECK:   br ^bb4
   ! CHECK: ^bb3:  // pred: ^bb1
   ! CHECK:   %[[V_9:[0-9]+]] = fir.address_of(@_QQcl.4320432043) : !fir.ref<!fir.char<1,5>>
-  ! CHECK:   %[[V_10:[0-9]+]] = arith.cmpi slt, %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_11:[0-9]+]] = arith.select %[[V_10]], %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_12:[0-9]+]] = fir.convert %[[V_11]] : (index) -> i64
-  ! CHECK:   %[[V_13:[0-9]+]] = arith.muli %c1{{.*}}_i64, %[[V_12]] : i64
-  ! CHECK:   %[[V_14:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+  ! CHECK:   %[[V_12:[0-9]+]] = fir.convert %c5{{.*}} : (index) -> i64
+  ! CHECK:   %[[V_13:[0-9]+]] = arith.muli %c1{{.*}}, %[[V_12]] : i64
+  ! CHECK:   %[[V_14:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   %[[V_15:[0-9]+]] = fir.convert %[[V_9]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   fir.call @llvm.memmove.p0.p0.i64(%[[V_14]], %[[V_15]], %[[V_13]], %false{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-  ! CHECK:   %[[V_16:[0-9]+]] = arith.subi %c5{{.*}}, %c1{{.*}} : index
-  ! CHECK:   %[[V_17:[0-9]+]] = fir.undefined !fir.char<1>
-  ! CHECK:   %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %c32{{.*}}_i8, [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
-  ! CHECK:   fir.do_loop %arg3 = %[[V_11]] to %[[V_16]] step %c1{{.*}} {
-  ! CHECK:     %[[V_20:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-  ! CHECK:     %[[V_21:[0-9]+]] = fir.coordinate_of %[[V_20]], %arg3 : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-  ! CHECK:     fir.store %[[V_18]] to %[[V_21]] : !fir.ref<!fir.char<1>>
-  ! CHECK:   }
   ! CHECK:   fir.call @_QFf1Ps3(%[[V_2]]) {{.*}}: (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>) -> ()
   ! CHECK:   br ^bb4
   ! CHECK: ^bb4:  // 2 preds: ^bb2, ^bb3
-  ! CHECK:   %[[V_19:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_19:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   return %[[V_19]] : !fir.boxchar<1>
   ! CHECK: }
   call s2
   if (n2 == 2) return
 
 ! CHECK-LABEL: @_QPf3
+! CHECK-SAME: %[[V_0:.*]]: !fir.ref<!fir.char<1,5>>
 entry f3
-  ! CHECK:   %[[V_0:[0-9]+]] = fir.convert %arg0 : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<!fir.char<1,?>>
   ! CHECK:   %[[V_1:[0-9]+]] = fir.alloca i32 {bindc_name = "n1", uniq_name = "_QFf1En1"}
   ! CHECK:   %[[V_2:[0-9]+]] = fir.alloca i32 {bindc_name = "n2", uniq_name = "_QFf1En2"}
   ! CHECK:   %[[V_3:[0-9]+]] = fir.alloca tuple<!fir.boxchar<1>, !fir.boxchar<1>>
   ! CHECK:   %[[V_4:[0-9]+]] = fir.coordinate_of %[[V_3]], %c0{{.*}}_i32 : (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-  ! CHECK:   %[[V_5:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_5:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   fir.store %[[V_5]] to %[[V_4]] : !fir.ref<!fir.boxchar<1>>
   ! CHECK:   %[[V_6:[0-9]+]] = fir.coordinate_of %[[V_3]], %c1{{.*}}_i32 : (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-  ! CHECK:   %[[V_7:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_7:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   fir.store %[[V_7]] to %[[V_6]] : !fir.ref<!fir.boxchar<1>>
   ! CHECK:   br ^bb1
   ! CHECK: ^bb1:  // pred: ^bb0
   ! CHECK:   %[[V_8:[0-9]+]] = fir.address_of(@_QQcl.4320432043) : !fir.ref<!fir.char<1,5>>
-  ! CHECK:   %[[V_9:[0-9]+]] = arith.cmpi slt, %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_10:[0-9]+]] = arith.select %[[V_9]], %c5{{.*}}, %c5{{.*}} : index
-  ! CHECK:   %[[V_11:[0-9]+]] = fir.convert %[[V_10]] : (index) -> i64
+  ! CHECK:   %[[V_11:[0-9]+]] = fir.convert %c5{{.*}} : (index) -> i64
   ! CHECK:   %[[V_12:[0-9]+]] = arith.muli %c1{{.*}}_i64, %[[V_11]] : i64
-  ! CHECK:   %[[V_13:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+  ! CHECK:   %[[V_13:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   %[[V_14:[0-9]+]] = fir.convert %[[V_8]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
   ! CHECK:   fir.call @llvm.memmove.p0.p0.i64(%[[V_13]], %[[V_14]], %[[V_12]], %false{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-  ! CHECK:   %[[V_15:[0-9]+]] = arith.subi %c5{{.*}}, %c1{{.*}} : index
-  ! CHECK:   %[[V_16:[0-9]+]] = fir.undefined !fir.char<1>
-  ! CHECK:   %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %c32{{.*}}_i8, [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
-  ! CHECK:   fir.do_loop %arg2 = %[[V_10]] to %[[V_15]] step %c1{{.*}} {
-  ! CHECK:     %[[V_19:[0-9]+]] = fir.convert %[[V_0]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-  ! CHECK:     %[[V_20:[0-9]+]] = fir.coordinate_of %[[V_19]], %arg2 : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-  ! CHECK:     fir.store %[[V_17]] to %[[V_20]] : !fir.ref<!fir.char<1>>
-  ! CHECK:   }
   ! CHECK:   fir.call @_QFf1Ps3(%[[V_3]]) {{.*}}: (!fir.ref<tuple<!fir.boxchar<1>, !fir.boxchar<1>>>) -> ()
   ! CHECK:   br ^bb2
   ! CHECK: ^bb2:  // pred: ^bb1
-  ! CHECK:   %[[V_18:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+  ! CHECK:   %[[V_18:[0-9]+]] = fir.emboxchar %[[V_0]], %c5{{.*}} : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
   ! CHECK:   return %[[V_18]] : !fir.boxchar<1>
   ! CHECK: }
   f3 = "C C C"
diff --git a/flang/test/Lower/equivalence-static-init.f90 b/flang/test/Lower/equivalence-static-init.f90
index 6b4b0ccff4c5e6..4c52a4a7d8448a 100644
--- a/flang/test/Lower/equivalence-static-init.f90
+++ b/flang/test/Lower/equivalence-static-init.f90
@@ -8,7 +8,7 @@ module module_without_init
   equivalence(i(1), x)
 end module
 ! CHECK-LABEL: fir.global @_QMmodule_without_initEi : !fir.array<8xi8> {
-  ! CHECK: %0 = fir.undefined !fir.array<8xi8>
+  ! CHECK: %0 = fir.zero_bits !fir.array<8xi8>
   ! CHECK: fir.has_value %0 : !fir.array<8xi8>
 ! CHECK}
 
diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90
index 7fe9cc394860d0..047221b8350228 100644
--- a/flang/test/Lower/host-associated.f90
+++ b/flang/test/Lower/host-associated.f90
@@ -486,8 +486,7 @@ end subroutine test_proc_dummy_other
 ! CHECK:         %[[VAL_12:.*]] = fir.alloca !fir.char<1,10> {bindc_name = "message", uniq_name = "_QFtest_proc_dummy_charEmessage"}
 ! CHECK:         %[[VAL_13:.*]] = fir.alloca tuple<!fir.boxchar<1>>
 ! CHECK:         %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_13]], %[[VAL_1]] : (!fir.ref<tuple<!fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
-! CHECK:         %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:         %[[VAL_16:.*]] = fir.emboxchar %[[VAL_15]], %[[VAL_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:         %[[VAL_16:.*]] = fir.emboxchar %[[VAL_12]], %[[VAL_0]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
 ! CHECK:         fir.store %[[VAL_16]] to %[[VAL_14]] : !fir.ref<!fir.boxchar<1>>
 ! CHECK:         %[[VAL_17:.*]] = fir.address_of(@_QQcl.{{.*}}) : !fir.ref<!fir.char<1,9>>
 ! CHECK:         %[[VAL_18:.*]] = fir.convert %[[VAL_2]] : (index) -> i64
@@ -536,14 +535,13 @@ end subroutine test_proc_dummy_other
 ! CHECK-DAG:         %[[VAL_6:.*]] = arith.constant 1 : index
 ! CHECK-DAG:         %[[VAL_7:.*]] = arith.constant 32 : i8
 ! CHECK-DAG:         %[[VAL_8:.*]] = arith.constant 0 : index
-! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:         %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_3]] : (!fir.ref<tuple<!fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
 ! CHECK:         %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.boxchar<1>>
 ! CHECK:         %[[VAL_11:.*]]:2 = fir.unboxchar %[[VAL_10]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:         %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]]#1, %[[VAL_4]] : index
 ! CHECK:         %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_4]], %[[VAL_11]]#1 : index
 ! CHECK:         %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (index) -> i64
-! CHECK:         %[[VAL_16:.*]] = fir.convert %[[VAL_12]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK:         %[[VAL_16:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<i8>
 ! CHECK:         %[[VAL_17:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
 ! CHECK:         fir.call @llvm.memmove.p0.p0.i64(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]], %[[VAL_5]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
 ! CHECK:         %[[VAL_18:.*]] = fir.undefined !fir.char<1>
@@ -554,14 +552,14 @@ end subroutine test_proc_dummy_other
 ! CHECK:         %[[VAL_23:.*]] = arith.cmpi sgt, %[[VAL_22]], %[[VAL_8]] : index
 ! CHECK:         cond_br %[[VAL_23]], ^bb2, ^bb3
 ! CHECK:       ^bb2:
-! CHECK:         %[[VAL_24:.*]] = fir.convert %[[VAL_12]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-! CHECK:         %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_24]], %[[VAL_21]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
+! CHECK:         %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.array<10x!fir.char<1>>>
+! CHECK:         %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_24]], %[[VAL_21]] : (!fir.ref<!fir.array<10x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
 ! CHECK:         fir.store %[[VAL_19]] to %[[VAL_25]] : !fir.ref<!fir.char<1>>
 ! CHECK:         %[[VAL_26:.*]] = arith.addi %[[VAL_21]], %[[VAL_6]] : index
 ! CHECK:         %[[VAL_27:.*]] = arith.subi %[[VAL_22]], %[[VAL_6]] : index
 ! CHECK:         br ^bb1(%[[VAL_26]], %[[VAL_27]] : index, index)
 ! CHECK:       ^bb3:
-! CHECK:         %[[VAL_28:.*]] = fir.emboxchar %[[VAL_12]], %[[VAL_4]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:         %[[VAL_28:.*]] = fir.emboxchar %[[VAL_0]], %[[VAL_4]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
 ! CHECK:         return %[[VAL_28]] : !fir.boxchar<1>
 ! CHECK:       }
 
@@ -575,7 +573,6 @@ end subroutine test_proc_dummy_other
 ! CHECK-DAG:     %[[VAL_6:.*]] = arith.constant 1 : index
 ! CHECK-DAG:     %[[VAL_7:.*]] = arith.constant 32 : i8
 ! CHECK-DAG:     %[[VAL_8:.*]] = arith.constant 0 : index
-! CHECK:         %[[VAL_9:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,40>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:         %[[VAL_10:.*]] = fir.address_of(@_QQcl.{{.*}}) : !fir.ref<!fir.char<1,12>>
 ! CHECK:         %[[VAL_11:.*]] = fir.extract_value %[[VAL_2]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()>
 ! CHECK:         %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -610,7 +607,7 @@ end subroutine test_proc_dummy_other
 ! CHECK:         %[[VAL_35:.*]] = arith.cmpi sgt, %[[VAL_19]], %[[VAL_3]] : index
 ! CHECK:         %[[VAL_36:.*]] = arith.select %[[VAL_35]], %[[VAL_3]], %[[VAL_19]] : index
 ! CHECK:         %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (index) -> i64
-! CHECK:         %[[VAL_38:.*]] = fir.convert %[[VAL_9]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK:         %[[VAL_38:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,40>>) -> !fir.ref<i8>
 ! CHECK:         fir.call @llvm.memmove.p0.p0.i64(%[[VAL_38]], %[[VAL_22]], %[[VAL_37]], %[[VAL_5]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
 ! CHECK:         %[[VAL_39:.*]] = fir.undefined !fir.char<1>
 ! CHECK:         %[[VAL_40:.*]] = fir.insert_value %[[VAL_39]], %[[VAL_7]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1>
@@ -620,15 +617,15 @@ end subroutine test_proc_dummy_other
 ! CHECK:         %[[VAL_44:.*]] = arith.cmpi sgt, %[[VAL_43]], %[[VAL_8]] : index
 ! CHECK:         cond_br %[[VAL_44]], ^bb5, ^bb6
 ! CHECK:       ^bb5:
-! CHECK:         %[[VAL_45:.*]] = fir.convert %[[VAL_9]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-! CHECK:         %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_45]], %[[VAL_42]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
+! CHECK:         %[[VAL_45:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.char<1,40>>) -> !fir.ref<!fir.array<40x!fir.char<1>>>
+! CHECK:         %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_45]], %[[VAL_42]] : (!fir.ref<!fir.array<40x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
 ! CHECK:         fir.store %[[VAL_40]] to %[[VAL_46]] : !fir.ref<!fir.char<1>>
 ! CHECK:         %[[VAL_47:.*]] = arith.addi %[[VAL_42]], %[[VAL_6]] : index
 ! CHECK:         %[[VAL_48:.*]] = arith.subi %[[VAL_43]], %[[VAL_6]] : index
 ! CHECK:         br ^bb4(%[[VAL_47]], %[[VAL_48]] : index, index)
 ! CHECK:       ^bb6:
 ! CHECK:         fir.call @llvm.stackrestore.p0(%[[VAL_14]]) {{.*}}: (!fir.ref<i8>) -> ()
-! CHECK:         %[[VAL_49:.*]] = fir.emboxchar %[[VAL_9]], %[[VAL_3]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:         %[[VAL_49:.*]] = fir.emboxchar %[[VAL_0]], %[[VAL_3]] : (!fir.ref<!fir.char<1,40>>, index) -> !fir.boxchar<1>
 ! CHECK:         return %[[VAL_49]] : !fir.boxchar<1>
 ! CHECK:       }
 
diff --git a/flang/test/Lower/implicit-call-mismatch.f90 b/flang/test/Lower/implicit-call-mismatch.f90
index 37d717f8f890b0..5d38bd6cc06268 100644
--- a/flang/test/Lower/implicit-call-mismatch.f90
+++ b/flang/test/Lower/implicit-call-mismatch.f90
@@ -234,8 +234,8 @@ subroutine pass_char_to_char_proc(a)
 ! CHECK-LABEL: func.func @_QPpass_char_to_char_proc(
 ! CHECK-SAME: %[[arg0:.*]]: !fir.boxchar<1>
 ! CHECK: %[[charRefAndLen:.*]]:2 = fir.unboxchar %[[arg0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[charLen:.*]] = arith.constant 8 : index
 ! CHECK: %[[charRef:.*]] = fir.convert %[[charRefAndLen]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,8>>
+! CHECK: %[[charLen:.*]] = arith.constant 8 : index
 ! CHECK: %[[procAddr:.*]] = fir.convert %[[charRef]] : (!fir.ref<!fir.char<1,8>>) -> (() -> ())
 ! CHECK: %[[boxProc:.*]] = fir.emboxproc %[[procAddr]] : (() -> ()) -> !fir.boxproc<() -> ()>
 ! CHECK: %[[charLen2:.*]] = fir.convert %[[charLen]] : (index) -> i64
diff --git a/flang/test/Lower/pointer-args-caller.f90 b/flang/test/Lower/pointer-args-caller.f90
index 8f05cbbc43bbab..1aee08cb8b8982 100644
--- a/flang/test/Lower/pointer-args-caller.f90
+++ b/flang/test/Lower/pointer-args-caller.f90
@@ -102,8 +102,8 @@ subroutine test_non_ptr_to_char_array_ptr(p)
   character(10), target :: p(10)
 ! CHECK:  %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>
 ! CHECK:  %[[VAL_2:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_3:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_4:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,10>>>
+! CHECK:  %[[VAL_3:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.array<10x!fir.char<1,10>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
diff --git a/flang/test/Lower/pointer-default-init.f90 b/flang/test/Lower/pointer-default-init.f90
index 4697d601347e74..6dde5d56aaa822 100644
--- a/flang/test/Lower/pointer-default-init.f90
+++ b/flang/test/Lower/pointer-default-init.f90
@@ -21,7 +21,7 @@ module test
   type(t) :: test_module_var
 ! CHECK-LABEL:   fir.global @_QMtestEtest_module_var : !fir.type<_QMtestTt{i:i32,x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}> {
 ! CHECK:  %[[VAL_0:.*]] = fir.undefined !fir.type<_QMtestTt{i:i32,x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>
-! CHECK:  %[[VAL_1:.*]] = fir.undefined i32
+! CHECK:  %[[VAL_1:.*]] = fir.zero_bits i32
 ! CHECK:  %[[VAL_2:.*]] = fir.field_index i
 ! CHECK:  %[[VAL_3:.*]] = fir.insert_value %[[VAL_0]], %[[VAL_1]]
 ! CHECK:  %[[VAL_4:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xf32>>
@@ -97,7 +97,7 @@ subroutine test_saved_pointer()
 
 ! CHECK-LABEL:   fir.global internal @_QFtest_savedEx : !fir.type<_QMtestTt{i:i32,x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}> {
 ! CHECK:  %[[VAL_0:.*]] = fir.undefined !fir.type<_QMtestTt{i:i32,x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>
-! CHECK:  %[[VAL_1:.*]] = fir.undefined i32
+! CHECK:  %[[VAL_1:.*]] = fir.zero_bits i32
 ! CHECK:  %[[VAL_2:.*]] = fir.field_index i
 ! CHECK:  %[[VAL_3:.*]] = fir.insert_value %[[VAL_0]], %[[VAL_1]]
 ! CHECK:  %[[VAL_4:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xf32>>
diff --git a/flang/test/Lower/pointer-initial-target-2.f90 b/flang/test/Lower/pointer-initial-target-2.f90
index 69e9f23126708f..a3f2292e7fb767 100644
--- a/flang/test/Lower/pointer-initial-target-2.f90
+++ b/flang/test/Lower/pointer-initial-target-2.f90
@@ -12,7 +12,7 @@
   common /a/ p
   data p /b/
 ! CHECK-LABEL: fir.global @a_ : tuple<!fir.box<!fir.ptr<f32>>>
-  ! CHECK: %[[undef:.*]] = fir.undefined tuple<!fir.box<!fir.ptr<f32>>>
+  ! CHECK: %[[undef:.*]] = fir.zero_bits tuple<!fir.box<!fir.ptr<f32>>>
   ! CHECK: %[[b:.*]] = fir.address_of(@_QEb) : !fir.ref<f32>
   ! CHECK: %[[box:.*]] = fir.embox %[[b]] : (!fir.ref<f32>) -> !fir.box<f32>
   ! CHECK: %[[rebox:.*]] = fir.rebox %[[box]] : (!fir.box<f32>) -> !fir.box<!fir.ptr<f32>>
@@ -41,7 +41,7 @@ block data bdsnake
   integer, pointer :: p => b
   common /snake/ p, b
 ! CHECK-LABEL: fir.global @snake_ : tuple<!fir.box<!fir.ptr<i32>>, i32>
-  ! CHECK: %[[tuple0:.*]] = fir.undefined tuple<!fir.box<!fir.ptr<i32>>, i32>
+  ! CHECK: %[[tuple0:.*]] = fir.zero_bits tuple<!fir.box<!fir.ptr<i32>>, i32>
   ! CHECK: %[[snakeAddr:.*]] = fir.address_of(@snake_) : !fir.ref<tuple<!fir.box<!fir.ptr<i32>>, i32>>
   ! CHECK: %[[byteView:.*]] = fir.convert %[[snakeAddr:.*]] : (!fir.ref<tuple<!fir.box<!fir.ptr<i32>>, i32>>) -> !fir.ref<!fir.array<?xi8>>
   ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[byteView]], %c24{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
diff --git a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
index 3f8fe4953c559f..5341fb2e7d2885 100644
--- a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
@@ -19,7 +19,7 @@ program sample
         y = y + 10
     
     !$omp atomic update hint(5)
-        y = x
+        y = x + y
     
     !ERROR: Hint clause value is not a valid OpenMP synchronization value
     !$omp atomic hint(7) capture
diff --git a/flang/test/Semantics/OpenMP/atomic01.f90 b/flang/test/Semantics/OpenMP/atomic01.f90
index 79f69e5843e82f..5469f82a0083ca 100644
--- a/flang/test/Semantics/OpenMP/atomic01.f90
+++ b/flang/test/Semantics/OpenMP/atomic01.f90
@@ -55,40 +55,49 @@
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive
   !$omp atomic seq_cst seq_cst update
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive
   !$omp atomic update seq_cst seq_cst
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive
   !$omp atomic seq_cst update seq_cst
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELEASE clause can appear on the UPDATE directive
   !$omp atomic release release update
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELEASE clause can appear on the UPDATE directive
   !$omp atomic update release release
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELEASE clause can appear on the UPDATE directive
   !$omp atomic release update release
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELAXED clause can appear on the UPDATE directive
   !$omp atomic relaxed relaxed update
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELAXED clause can appear on the UPDATE directive
   !$omp atomic update relaxed relaxed
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELAXED clause can appear on the UPDATE directive
   !$omp atomic relaxed update relaxed
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
 !CAPTURE
@@ -240,14 +249,17 @@
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
   !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release release
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
 ! 2.17.7.3
@@ -282,21 +294,27 @@
     i = j
   !ERROR: At most one HINT clause can appear on the UPDATE directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) update
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: At most one HINT clause can appear on the UPDATE directive
   !$omp atomic hint(omp_sync_hint_nonspeculative) update hint(omp_sync_hint_nonspeculative)
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: At most one HINT clause can appear on the UPDATE directive
   !$omp atomic update hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative)
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_none) hint(omp_sync_hint_nonspeculative)
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: At most one HINT clause can appear on the CAPTURE directive
@@ -354,25 +372,31 @@
 
   !ERROR: Clause ACQ_REL is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic acq_rel update
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
   !ERROR: Clause ACQ_REL is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic update acq_rel
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: Clause ACQUIRE is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic acquire update
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: Clause ACQUIRE is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic update acquire
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: Clause ACQ_REL is not allowed on the ATOMIC directive
   !$omp atomic acq_rel
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 
   !ERROR: Clause ACQUIRE is not allowed on the ATOMIC directive
   !$omp atomic acquire
+  !ERROR: Invalid or missing operator in atomic update statement
     i = j
 end program
 
diff --git a/flang/test/Semantics/OpenMP/atomic02.f90 b/flang/test/Semantics/OpenMP/atomic02.f90
index 241c27a31dd942..c11622c87cbd35 100644
--- a/flang/test/Semantics/OpenMP/atomic02.f90
+++ b/flang/test/Semantics/OpenMP/atomic02.f90
@@ -26,34 +26,34 @@ program OmpAtomic
    !$omp atomic
    a = a/(b + 1)
    !$omp atomic
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    a = a**4
    !$omp atomic
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    c = c//d
    !$omp atomic
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .LT. b
    !$omp atomic
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .LE. b
    !$omp atomic
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .EQ. b
    !$omp atomic
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .NE. b
    !$omp atomic
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .GE. b
    !$omp atomic
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .GT. b
    !$omp atomic
    m = m .AND. n
@@ -72,30 +72,30 @@ program OmpAtomic
    !$omp atomic update
    a = a/(b + 1)
    !$omp atomic update
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    a = a**4
    !$omp atomic update
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    c = c//d
    !$omp atomic update
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .LT. b
    !$omp atomic update
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .LE. b
    !$omp atomic update
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .EQ. b
    !$omp atomic update
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .GE. b
    !$omp atomic update
    !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid operator in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Invalid or missing operator in atomic update statement
    l = a .GT. b
    !$omp atomic update
    m = m .AND. n
diff --git a/flang/test/Semantics/OpenMP/atomic03.f90 b/flang/test/Semantics/OpenMP/atomic03.f90
index f829e78426e5a4..5fc642088ca56f 100644
--- a/flang/test/Semantics/OpenMP/atomic03.f90
+++ b/flang/test/Semantics/OpenMP/atomic03.f90
@@ -23,26 +23,28 @@ program OmpAtomic
    y = MIN(y, 8)
 
 !$omp atomic
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = IAND(y, 4)
 !$omp atomic
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = IOR(y, 5)
 !$omp atomic
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = IEOR(y, 6)
 !$omp atomic
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = MAX(y, 7, b, c)
 !$omp atomic
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = MIN(y, 8, a, d)
 
 !$omp atomic
    !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y'
    y = FRACTION(x)
 !$omp atomic
    !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y'
    y = REAL(x)
 !$omp atomic update
    y = IAND(y, 4)
@@ -56,19 +58,19 @@ program OmpAtomic
    y = MIN(y, 8)
 
 !$omp atomic update
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = IAND(y, 4)
-!$omp atomic update
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+!$omp atomic update 
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = IOR(y, 5)
 !$omp atomic update
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = IEOR(y, 6)
 !$omp atomic update
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = MAX(y, 7)
 !$omp atomic update
-   !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
    z = MIN(y, 8)
 
 !$omp atomic update
@@ -88,6 +90,53 @@ subroutine conflicting_types()
     type(simple) ::s
     z = 1
     !$omp atomic
-    !ERROR: Atomic update variable 'z' not found in the argument list of intrinsic procedure
+    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
     z = IAND(s%z, 4)
 end subroutine
+
+subroutine more_invalid_atomic_update_stmts()
+    integer :: a, b
+    integer :: k(10)
+    type some_type
+        integer :: m(10)
+    end type
+    type(some_type) :: s
+ 
+    !$omp atomic update
+    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a'
+        a = min(a, a, b)
+     
+    !$omp atomic
+    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a'
+        a = max(b, a, b, a)
+
+    !$omp atomic
+    !ERROR: Atomic update statement should be of the form `a = intrinsic_procedure(a, expr_list)` OR `a = intrinsic_procedure(expr_list, a)`
+        a = min(b, a, b)
+
+    !$omp atomic
+    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a'
+        a = max(b, a, b, a, b)
+    
+    !$omp atomic update
+    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y'
+        y = min(z, x)
+     
+    !$omp atomic
+        z = max(z, y)
+
+    !$omp atomic update
+    !ERROR: Expected scalar variable on the LHS of atomic update assignment statement
+    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'k'
+        k = max(x, y)
+    
+    !$omp atomic
+    !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
+    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+        x = min(x, k)
+
+    !$omp atomic
+    !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
+    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+        z =z + s%m
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90
index 9ac8ddee884615..ddf6b8bc2f5f6f 100644
--- a/flang/test/Semantics/OpenMP/atomic04.f90
+++ b/flang/test/Semantics/OpenMP/atomic04.f90
@@ -19,21 +19,24 @@ program OmpAtomic
    x = 1 + x
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = y + 1
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   x = 1 + (y + x)
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   x = 1 + y
 
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   x = 1 - (10 * (y + x))
+   x = x - 1
 !$omp atomic
    x = 1 - x
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = y - 1
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = 1 - y
 
 !$omp atomic
@@ -42,10 +45,12 @@ program OmpAtomic
    x = 1*x
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   x = y*(10 + x)
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   x = y*1
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   x = (44 * x) * y
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   x = 1*y
 
 !$omp atomic
    x = x/1
@@ -53,25 +58,29 @@ program OmpAtomic
    x = 1/x
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = y/1
 !$omp atomic
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = 1/y
 
 !$omp atomic
    m = m .AND. n
 !$omp atomic
    m = n .AND. m
-!$omp atomic
+!$omp atomic 
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .AND. l
 
 !$omp atomic
    m = m .OR. n
 !$omp atomic
    m = n .OR. m
-!$omp atomic
+!$omp atomic 
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .OR. l
 
 !$omp atomic
@@ -80,6 +89,7 @@ program OmpAtomic
    m = n .EQV. m
 !$omp atomic
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .EQV. l
 
 !$omp atomic
@@ -88,6 +98,7 @@ program OmpAtomic
    m = n .NEQV. m
 !$omp atomic
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .NEQV. l
 
 !$omp atomic update
@@ -96,9 +107,11 @@ program OmpAtomic
    x = 1 + x
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = y + 1
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = 1 + y
 
 !$omp atomic update
@@ -107,9 +120,11 @@ program OmpAtomic
    x = 1 - x
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = y - 1
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = 1 - y
 
 !$omp atomic update
@@ -118,9 +133,11 @@ program OmpAtomic
    x = 1*x
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = y*1
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
    x = 1*y
 
 !$omp atomic update
@@ -129,10 +146,12 @@ program OmpAtomic
    x = 1/x
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   x = max(x, y) + 10
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   x = y/1
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   x = y * min(x, y)
+   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   x = 1/y
 
 !$omp atomic update
    m = m .AND. n
@@ -140,6 +159,7 @@ program OmpAtomic
    m = n .AND. m
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .AND. l
 
 !$omp atomic update
@@ -148,6 +168,7 @@ program OmpAtomic
    m = n .OR. m
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .OR. l
 
 !$omp atomic update
@@ -156,6 +177,7 @@ program OmpAtomic
    m = n .EQV. m
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .EQV. l
 
 !$omp atomic update
@@ -164,6 +186,78 @@ program OmpAtomic
    m = n .NEQV. m
 !$omp atomic update
    !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
+   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
    m = n .NEQV. l
 
 end program OmpAtomic
+
+subroutine more_invalid_atomic_update_stmts()
+    integer :: a, b, c
+    integer :: d(10)
+    real :: x, y, z(10)
+    type some_type
+        real :: m
+        real :: n(10)
+    end type
+    type(some_type) p
+    
+    !$omp atomic
+    !ERROR: Invalid or missing operator in atomic update statement
+        x = x
+
+    !$omp atomic update
+    !ERROR: Invalid or missing operator in atomic update statement
+        x = 1    
+
+    !$omp atomic update
+    !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement
+        a = a * b + a
+
+    !$omp atomic
+    !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a`
+        a = b * (a + 9)
+
+    !$omp atomic update
+    !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement
+        a = a * (a + b)
+
+    !$omp atomic
+    !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement
+        a = (b + a) * a
+
+    !$omp atomic
+    !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a`
+        a = a * b + c
+
+    !$omp atomic update
+    !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a`
+        a = a + b + c
+
+    !$omp atomic
+        a = b * c + a
+
+    !$omp atomic update
+        a = c + b + a
+
+    !$omp atomic
+    !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
+    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+        a = a + d
+
+    !$omp atomic update
+    !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
+    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+        x = x * y / z
+
+    !$omp atomic
+    !ERROR: Atomic update statement should be of form `p%m = p%m operator expr` OR `p%m = expr operator p%m`
+    !ERROR: Exactly one occurence of 'p%m' expected on the RHS of atomic update assignment statement
+        p%m = x + y
+
+    !$omp atomic update
+    !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
+    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+    !ERROR: Exactly one occurence of 'p%m' expected on the RHS of atomic update assignment statement
+        p%m = p%m + p%n
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/atomic05.f90 b/flang/test/Semantics/OpenMP/atomic05.f90
index 70492675e61e18..b3ff6e9b910f4f 100644
--- a/flang/test/Semantics/OpenMP/atomic05.f90
+++ b/flang/test/Semantics/OpenMP/atomic05.f90
@@ -17,6 +17,7 @@ program OmpAtomic
         x = 2 * 4
     !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
     !$omp atomic update release, seq_cst
+    !ERROR: Invalid or missing operator in atomic update statement
         x = 10
     !ERROR: More than one memory order clause not allowed on OpenMP Atomic construct
     !$omp atomic capture release, seq_cst
diff --git a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90 b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90
index 3ba54a32638b51..7e2d508839ff60 100644
--- a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90
+++ b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90
@@ -36,10 +36,9 @@ program sample
     !ERROR: k must not have ALLOCATABLE attribute
         k = x
 
-    !$omp atomic update 
-    !ERROR: Atomic update statement should be of form `k = k operator expr` OR `k = expr operator k`
+    !$omp atomic update
     !ERROR: k must not have ALLOCATABLE attribute
-        k = v + k * (v * k)
+        k = k + x * (v * x)
 
     !$omp atomic
     !ERROR: k must not have ALLOCATABLE attribute
diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py
index 89a2353047c305..dda8ed456c986a 100644
--- a/flang/test/lit.cfg.py
+++ b/flang/test/lit.cfg.py
@@ -153,16 +153,19 @@
 # the C++ runtime libraries. For this we need a C compiler. If for some reason
 # we don't have one, we can just disable the test.
 if config.cc:
-    libruntime = os.path.join(config.flang_lib_dir, "libflang-rt.a")
+    libruntime = os.path.join(config.flang_lib_dir, "libFortranRuntime.a")
+    libdecimal = os.path.join(config.flang_lib_dir, "libFortranDecimal.a")
     include = os.path.join(config.flang_src_dir, "include")
 
     if (
         os.path.isfile(libruntime)
+        and os.path.isfile(libdecimal)
         and os.path.isdir(include)
     ):
         config.available_features.add("c-compiler")
         tools.append(ToolSubst("%cc", command=config.cc, unresolved="fatal"))
         tools.append(ToolSubst("%libruntime", command=libruntime, unresolved="fatal"))
+        tools.append(ToolSubst("%libdecimal", command=libdecimal, unresolved="fatal"))
         tools.append(ToolSubst("%include", command=include, unresolved="fatal"))
 
 # Add all the tools and their substitutions (if applicable). Use the search paths provided for
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index ddac27dcdb95c5..6752d2cf53532c 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -368,7 +368,8 @@ static mlir::LogicalResult convertFortranSourceToMLIR(
     pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
 
     // Add O2 optimizer pass pipeline.
-    fir::createDefaultFIROptimizerPassPipeline(pm, llvm::OptimizationLevel::O2);
+    fir::createDefaultFIROptimizerPassPipeline(
+        pm, MLIRToLLVMPassPipelineConfig(llvm::OptimizationLevel::O2));
   }
 
   if (mlir::succeeded(pm.run(mlirModule))) {
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index 9f33cdfe3fa90f..3ce8b407450d24 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -14,6 +14,14 @@ set( LLVM_LINK_COMPONENTS
 add_flang_tool(flang-new
   driver.cpp
   fc1_main.cpp
+
+  DEPENDS
+  # These libraries are used in the linker invocation generated by the driver
+  # (i.e. when constructing the linker job). Without them the driver would be
+  # unable to generate executables.
+  FortranRuntime
+  FortranDecimal
+  Fortran_main
 )
 
 target_link_libraries(flang-new
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
index 9552d5c9af9e30..31d6bac142dc42 100644
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -17,6 +17,7 @@
 #include "flang/Optimizer/Support/InitFIR.h"
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "flang/Optimizer/Transforms/Passes.h"
+#include "flang/Tools/CrossToolHelpers.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
@@ -118,12 +119,13 @@ compileFIR(const mlir::PassPipelineCLParser &passPipeline) {
     if (mlir::failed(passPipeline.addToPipeline(pm, errorHandler)))
       return mlir::failure();
   } else {
+    MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
     if (codeGenLLVM) {
       // Run only CodeGen passes.
-      fir::createDefaultFIRCodeGenPassPipeline(pm);
+      fir::createDefaultFIRCodeGenPassPipeline(pm, config);
     } else {
       // Run tco with O2 by default.
-      fir::createMLIRToLLVMPassPipeline(pm, llvm::OptimizationLevel::O2);
+      fir::createMLIRToLLVMPassPipeline(pm, config);
     }
     fir::addLLVMDialectToLLVMPass(pm, out.os());
   }
diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt
index ab12fd8e7f5daa..72d37ebeb853c5 100644
--- a/flang/unittests/CMakeLists.txt
+++ b/flang/unittests/CMakeLists.txt
@@ -24,15 +24,11 @@ function(add_flang_unittest_offload_properties target)
   # FIXME: replace 'native' in --offload-arch option with the list
   #        of targets that Fortran Runtime was built for.
   #        Common code must be moved from flang/runtime/CMakeLists.txt.
-  # TODO: Revisit this because of Flang-rt. runtime is no longer an added subdirectory of flang. So we temporarily duplicated the option definition to here. This is not a permanent solution.
-  set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
-    "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'")
-
   if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off")
     set_target_properties(${target}
-        PROPERTIES LINK_OPTIONS
-        "-fopenmp;--offload-arch=native"
-    )
+      PROPERTIES LINK_OPTIONS
+      "-fopenmp;--offload-arch=native"
+      )
   endif()
 endfunction()
 
@@ -78,4 +74,5 @@ add_subdirectory(Optimizer)
 add_subdirectory(Common)
 add_subdirectory(Decimal)
 add_subdirectory(Evaluate)
+add_subdirectory(Runtime)
 add_subdirectory(Frontend)
diff --git a/flang/unittests/Evaluate/CMakeLists.txt b/flang/unittests/Evaluate/CMakeLists.txt
index f9551dac713a34..4658d8d3345b5d 100644
--- a/flang/unittests/Evaluate/CMakeLists.txt
+++ b/flang/unittests/Evaluate/CMakeLists.txt
@@ -45,6 +45,7 @@ add_flang_nongtest_unittest(intrinsics
   FortranDecimal
   FortranSemantics
   FortranParser
+  FortranRuntime
 )
 
 add_flang_nongtest_unittest(logical
@@ -67,6 +68,20 @@ add_flang_nongtest_unittest(real
 )
 llvm_update_compile_flags(real.test)
 
+add_flang_nongtest_unittest(reshape
+  FortranEvaluateTesting
+  FortranSemantics
+  FortranEvaluate
+  FortranRuntime
+)
+
+add_flang_nongtest_unittest(ISO-Fortran-binding
+  FortranEvaluateTesting
+  FortranEvaluate
+  FortranSemantics
+  FortranRuntime
+)
+
 add_flang_nongtest_unittest(folding
   FortranCommon
   FortranEvaluateTesting
diff --git a/flang-rt/unittests/FortranEvaluate/ISO-Fortran-binding.cpp b/flang/unittests/Evaluate/ISO-Fortran-binding.cpp
similarity index 100%
rename from flang-rt/unittests/FortranEvaluate/ISO-Fortran-binding.cpp
rename to flang/unittests/Evaluate/ISO-Fortran-binding.cpp
diff --git a/flang-rt/unittests/FortranEvaluate/reshape.cpp b/flang/unittests/Evaluate/reshape.cpp
similarity index 100%
rename from flang-rt/unittests/FortranEvaluate/reshape.cpp
rename to flang/unittests/Evaluate/reshape.cpp
diff --git a/flang/unittests/Optimizer/CMakeLists.txt b/flang/unittests/Optimizer/CMakeLists.txt
index 34b45d0e77822c..9c165d998e2e11 100644
--- a/flang/unittests/Optimizer/CMakeLists.txt
+++ b/flang/unittests/Optimizer/CMakeLists.txt
@@ -19,6 +19,16 @@ add_flang_unittest(FlangOptimizerTests
   Builder/DoLoopHelperTest.cpp
   Builder/FIRBuilderTest.cpp
   Builder/HLFIRToolsTest.cpp
+  Builder/Runtime/AllocatableTest.cpp
+  Builder/Runtime/AssignTest.cpp
+  Builder/Runtime/CommandTest.cpp
+  Builder/Runtime/CharacterTest.cpp
+  Builder/Runtime/DerivedTest.cpp
+  Builder/Runtime/NumericTest.cpp
+  Builder/Runtime/RaggedTest.cpp
+  Builder/Runtime/ReductionTest.cpp
+  Builder/Runtime/StopTest.cpp
+  Builder/Runtime/TransformationalTest.cpp
   FIRContextTest.cpp
   FIRTypesTest.cpp
   FortranVariableTest.cpp
diff --git a/flang/unittests/Optimizer/FIRTypesTest.cpp b/flang/unittests/Optimizer/FIRTypesTest.cpp
index 2bde6f556bdc82..b66c1ff441bade 100644
--- a/flang/unittests/Optimizer/FIRTypesTest.cpp
+++ b/flang/unittests/Optimizer/FIRTypesTest.cpp
@@ -308,7 +308,7 @@ TEST_F(FIRTypesTest, getTypeAsString) {
       fir::SequenceType::get({fir::SequenceType::getUnknownExtent(),
                                  fir::SequenceType::getUnknownExtent()},
           ty);
-  EXPECT_EQ("?x?xi64", fir::getTypeAsString(dynArrTy, *kindMap));
+  EXPECT_EQ("UxUxi64", fir::getTypeAsString(dynArrTy, *kindMap));
   EXPECT_EQ("llvmptr_i32",
       fir::getTypeAsString(
           fir::LLVMPointerType::get(mlir::IntegerType::get(&context, 32)),
diff --git a/flang-rt/unittests/FortranRuntime/Allocatable.cpp b/flang/unittests/Runtime/Allocatable.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Allocatable.cpp
rename to flang/unittests/Runtime/Allocatable.cpp
diff --git a/flang-rt/unittests/FortranRuntime/ArrayConstructor.cpp b/flang/unittests/Runtime/ArrayConstructor.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/ArrayConstructor.cpp
rename to flang/unittests/Runtime/ArrayConstructor.cpp
diff --git a/flang-rt/unittests/FortranRuntime/BufferTest.cpp b/flang/unittests/Runtime/BufferTest.cpp
similarity index 82%
rename from flang-rt/unittests/FortranRuntime/BufferTest.cpp
rename to flang/unittests/Runtime/BufferTest.cpp
index ce905b254a27e8..0632324b25d22e 100644
--- a/flang-rt/unittests/FortranRuntime/BufferTest.cpp
+++ b/flang/unittests/Runtime/BufferTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/../../runtime/buffer.h"
+#include "../../runtime/buffer.h"
 #include "CrashHandlerFixture.h"
 #include "gtest/gtest.h"
 #include <algorithm>
@@ -30,16 +30,15 @@ class Store : public FileFrame<Store, tinyBufferSize> {
   void set_expect(FileOffset to) { expect_ = to; }
 
   std::size_t Read(FileOffset at, char *to, std::size_t minBytes,
-                   std::size_t maxBytes, IoErrorHandler &handler) {
+      std::size_t maxBytes, IoErrorHandler &handler) {
     if (enforceSequence_ && at != expect_) {
       handler.SignalError("Read(%d,%d,%d) not at expected %d",
-                          static_cast<int>(at), static_cast<int>(minBytes),
-                          static_cast<int>(maxBytes),
-                          static_cast<int>(expect_));
+          static_cast<int>(at), static_cast<int>(minBytes),
+          static_cast<int>(maxBytes), static_cast<int>(expect_));
     } else if (at < 0 || at + minBytes > bytes_) {
       handler.SignalError("Read(%d,%d,%d) is out of bounds",
-                          static_cast<int>(at), static_cast<int>(minBytes),
-                          static_cast<int>(maxBytes));
+          static_cast<int>(at), static_cast<int>(minBytes),
+          static_cast<int>(maxBytes));
     }
     auto result{std::min<std::size_t>(maxBytes, bytes_ - at)};
     std::memcpy(to, &data_[at], result);
@@ -47,14 +46,14 @@ class Store : public FileFrame<Store, tinyBufferSize> {
     return result;
   }
   std::size_t Write(FileOffset at, const char *from, std::size_t bytes,
-                    IoErrorHandler &handler) {
+      IoErrorHandler &handler) {
     if (enforceSequence_ && at != expect_) {
       handler.SignalError("Write(%d,%d) not at expected %d",
-                          static_cast<int>(at), static_cast<int>(bytes),
-                          static_cast<int>(expect_));
+          static_cast<int>(at), static_cast<int>(bytes),
+          static_cast<int>(expect_));
     } else if (at < 0 || at + bytes > bytes_) {
       handler.SignalError("Write(%d,%d) is out of bounds", static_cast<int>(at),
-                          static_cast<int>(bytes));
+          static_cast<int>(bytes));
     }
     std::memcpy(&data_[at], from, bytes);
     expect_ = at + bytes;
@@ -71,8 +70,8 @@ class Store : public FileFrame<Store, tinyBufferSize> {
 inline int ChunkSize(int j, int most) {
   // 31, 1, 29, 3, 27, ...
   j %= tinyBufferSize;
-  auto chunk{static_cast<int>(((j % 2) ? j : (tinyBufferSize - 1 - j)) %
-                              tinyBufferSize)};
+  auto chunk{static_cast<int>(
+      ((j % 2) ? j : (tinyBufferSize - 1 - j)) % tinyBufferSize)};
   return std::min(chunk, most);
 }
 
diff --git a/flang-rt/unittests/FortranRuntime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt
similarity index 84%
rename from flang-rt/unittests/FortranRuntime/CMakeLists.txt
rename to flang/unittests/Runtime/CMakeLists.txt
index 3b03315794fcc2..23f02aa751246b 100644
--- a/flang-rt/unittests/FortranRuntime/CMakeLists.txt
+++ b/flang/unittests/Runtime/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_flang_rt_unittest(FortranRuntimeTests
+add_flang_unittest(FlangRuntimeTests
   Allocatable.cpp
   ArrayConstructor.cpp
   BufferTest.cpp
@@ -29,7 +29,7 @@ add_flang_rt_unittest(FortranRuntimeTests
   Transformational.cpp
 )
 
-target_link_libraries(FortranRuntimeTests
+target_link_libraries(FlangRuntimeTests
   PRIVATE
-  flang-rt
+  FortranRuntime
 )
diff --git a/flang-rt/unittests/FortranRuntime/CharacterTest.cpp b/flang/unittests/Runtime/CharacterTest.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/CharacterTest.cpp
rename to flang/unittests/Runtime/CharacterTest.cpp
diff --git a/flang-rt/unittests/FortranRuntime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/CommandTest.cpp
rename to flang/unittests/Runtime/CommandTest.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Complex.cpp b/flang/unittests/Runtime/Complex.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Complex.cpp
rename to flang/unittests/Runtime/Complex.cpp
diff --git a/flang-rt/unittests/FortranRuntime/CrashHandlerFixture.cpp b/flang/unittests/Runtime/CrashHandlerFixture.cpp
similarity index 86%
rename from flang-rt/unittests/FortranRuntime/CrashHandlerFixture.cpp
rename to flang/unittests/Runtime/CrashHandlerFixture.cpp
index 1ce8bf826cf27f..811603337e6608 100644
--- a/flang-rt/unittests/FortranRuntime/CrashHandlerFixture.cpp
+++ b/flang/unittests/Runtime/CrashHandlerFixture.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 #include "CrashHandlerFixture.h"
-#include "flang/../../runtime/terminator.h"
+#include "../../runtime/terminator.h"
 #include <cstdarg>
 #include <cstdlib>
 
 // Replaces Fortran runtime's crash handler so we can verify the crash message
-[[noreturn]] static void CatchCrash(const char *sourceFile, int sourceLine,
-                                    const char *message, va_list &ap) {
+[[noreturn]] static void CatchCrash(
+    const char *sourceFile, int sourceLine, const char *message, va_list &ap) {
   char buffer[1000];
   std::vsnprintf(buffer, sizeof buffer, message, ap);
   va_end(ap);
diff --git a/flang-rt/unittests/FortranRuntime/CrashHandlerFixture.h b/flang/unittests/Runtime/CrashHandlerFixture.h
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/CrashHandlerFixture.h
rename to flang/unittests/Runtime/CrashHandlerFixture.h
diff --git a/flang-rt/unittests/FortranRuntime/Derived.cpp b/flang/unittests/Runtime/Derived.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Derived.cpp
rename to flang/unittests/Runtime/Derived.cpp
diff --git a/flang-rt/unittests/FortranRuntime/ExternalIOTest.cpp b/flang/unittests/Runtime/ExternalIOTest.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/ExternalIOTest.cpp
rename to flang/unittests/Runtime/ExternalIOTest.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Format.cpp b/flang/unittests/Runtime/Format.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Format.cpp
rename to flang/unittests/Runtime/Format.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Inquiry.cpp b/flang/unittests/Runtime/Inquiry.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Inquiry.cpp
rename to flang/unittests/Runtime/Inquiry.cpp
diff --git a/flang-rt/unittests/FortranRuntime/ListInputTest.cpp b/flang/unittests/Runtime/ListInputTest.cpp
similarity index 83%
rename from flang-rt/unittests/FortranRuntime/ListInputTest.cpp
rename to flang/unittests/Runtime/ListInputTest.cpp
index c855ed93ee4b01..a4eba5283add68 100644
--- a/flang-rt/unittests/FortranRuntime/ListInputTest.cpp
+++ b/flang/unittests/Runtime/ListInputTest.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CrashHandlerFixture.h"
-#include "flang/../../runtime/io-error.h"
+#include "../../runtime/io-error.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/io-api.h"
 
@@ -36,15 +36,15 @@ TEST(InputTest, TestListInputAlphabet) {
   // Use _two_ input buffers and _three_ output buffers. Note the `3*` in the
   // _inputBuffers_.
   SetCharacter(inputBuffers[j++], maxInputBufferLength,
-               "3*'abcdefghijklmnopqrstuvwxyzABC");
-  SetCharacter(inputBuffers[j++], maxInputBufferLength,
-               "DEFGHIJKLMNOPQRSTUVWXYZ'");
+      "3*'abcdefghijklmnopqrstuvwxyzABC");
+  SetCharacter(
+      inputBuffers[j++], maxInputBufferLength, "DEFGHIJKLMNOPQRSTUVWXYZ'");
 
   StaticDescriptor<1> staticDescriptor;
   Descriptor &whole{staticDescriptor.descriptor()};
   SubscriptValue extent[]{numInputBuffers};
   whole.Establish(TypeCode{CFI_type_char}, maxInputBufferLength, &inputBuffers,
-                  1, extent, CFI_attribute_pointer);
+      1, extent, CFI_attribute_pointer);
   whole.Check();
   auto *cookie{IONAME(BeginInternalArrayListInput)(whole)};
 
@@ -79,7 +79,7 @@ TEST(InputTest, TestListInputIntegerList) {
   Descriptor &whole{staticDescriptor.descriptor()};
   SubscriptValue extent[]{numBuffers};
   whole.Establish(TypeCode{CFI_type_char}, maxBufferLength, &buffer, 1, extent,
-                  CFI_attribute_pointer);
+      CFI_attribute_pointer);
   whole.Check();
   auto *cookie{IONAME(BeginInternalArrayListInput)(whole)};
 
@@ -88,10 +88,10 @@ TEST(InputTest, TestListInputIntegerList) {
   // Negative numbers will be overwritten by _expectedOutput_, and positive
   // numbers will not be as their indices are "Null values" of the Fortran 2018
   // standard 13.10.3.2 in the format strings _buffer_
-  std::int64_t actualOutput[listInputLength]{-1, -2, -3, -4, 5,
-                                             -6, 7,  -8, 9,  10};
-  const std::int64_t expectedOutput[listInputLength]{1, 2, 3, 3, 5,
-                                                     6, 7, 8, 9, 10};
+  std::int64_t actualOutput[listInputLength]{
+      -1, -2, -3, -4, 5, -6, 7, -8, 9, 10};
+  const std::int64_t expectedOutput[listInputLength]{
+      1, 2, 3, 3, 5, 6, 7, 8, 9, 10};
   for (j = 0; j < listInputLength; ++j) {
     IONAME(InputInteger)(cookie, actualOutput[j]);
   }
@@ -116,7 +116,7 @@ TEST(InputTest, TestListInputInvalidFormatWithSingleSuccess) {
   Descriptor &whole{staticDescriptor.descriptor()};
   SubscriptValue extent[]{numBuffers};
   whole.Establish(TypeCode{CFI_type_char}, formatBuffer.size(),
-                  formatBuffer.data(), 1, extent, CFI_attribute_pointer);
+      formatBuffer.data(), 1, extent, CFI_attribute_pointer);
   whole.Check();
 
   auto *cookie{IONAME(BeginInternalArrayListInput)(whole)};
@@ -127,7 +127,7 @@ TEST(InputTest, TestListInputInvalidFormatWithSingleSuccess) {
 
   // Perform failing InputInteger
   ASSERT_DEATH(IONAME(InputInteger)(cookie, dummy),
-               "Bad character 'g' in INTEGER input field");
+      "Bad character 'g' in INTEGER input field");
 }
 
 // Same test as _TestListInputInvalidFormatWithSingleSuccess_, however no
@@ -140,7 +140,7 @@ TEST(InputTest, TestListInputInvalidFormat) {
   Descriptor &whole{staticDescriptor.descriptor()};
   SubscriptValue extent[]{numBuffers};
   whole.Establish(TypeCode{CFI_type_char}, formatBuffer.size(),
-                  formatBuffer.data(), 1, extent, CFI_attribute_pointer);
+      formatBuffer.data(), 1, extent, CFI_attribute_pointer);
   whole.Check();
 
   auto *cookie{IONAME(BeginInternalArrayListInput)(whole)};
@@ -148,7 +148,7 @@ TEST(InputTest, TestListInputInvalidFormat) {
 
   // Perform failing InputInteger
   ASSERT_DEATH(IONAME(InputInteger)(cookie, dummy),
-               "Bad character 'g' in INTEGER input field");
+      "Bad character 'g' in INTEGER input field");
 }
 
 using ParamTy = std::tuple<std::string, std::vector<int>>;
@@ -163,7 +163,7 @@ TEST_P(SimpleListInputTest, TestListInput) {
   Descriptor &whole{staticDescriptor.descriptor()};
   SubscriptValue extent[]{numBuffers};
   whole.Establish(TypeCode{CFI_type_char}, formatBuffer.size(),
-                  formatBuffer.data(), 1, extent, CFI_attribute_pointer);
+      formatBuffer.data(), 1, extent, CFI_attribute_pointer);
   whole.Check();
   auto *cookie{IONAME(BeginInternalArrayListInput)(whole)};
 
@@ -185,10 +185,9 @@ TEST_P(SimpleListInputTest, TestListInput) {
   }
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    SimpleListInputTestInstantiation, SimpleListInputTest,
+INSTANTIATE_TEST_SUITE_P(SimpleListInputTestInstantiation, SimpleListInputTest,
     testing::Values(std::make_tuple("", std::vector<int>{}),
-                    std::make_tuple("0", std::vector<int>{}),
-                    std::make_tuple("1", std::vector<int>{1}),
-                    std::make_tuple("1, 2", std::vector<int>{1, 2}),
-                    std::make_tuple("3*2", std::vector<int>{2, 2, 2})));
+        std::make_tuple("0", std::vector<int>{}),
+        std::make_tuple("1", std::vector<int>{1}),
+        std::make_tuple("1, 2", std::vector<int>{1, 2}),
+        std::make_tuple("3*2", std::vector<int>{2, 2, 2})));
diff --git a/flang-rt/unittests/FortranRuntime/LogicalFormatTest.cpp b/flang/unittests/Runtime/LogicalFormatTest.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/LogicalFormatTest.cpp
rename to flang/unittests/Runtime/LogicalFormatTest.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Matmul.cpp b/flang/unittests/Runtime/Matmul.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Matmul.cpp
rename to flang/unittests/Runtime/Matmul.cpp
diff --git a/flang-rt/unittests/FortranRuntime/MatmulTranspose.cpp b/flang/unittests/Runtime/MatmulTranspose.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/MatmulTranspose.cpp
rename to flang/unittests/Runtime/MatmulTranspose.cpp
diff --git a/flang-rt/unittests/FortranRuntime/MiscIntrinsic.cpp b/flang/unittests/Runtime/MiscIntrinsic.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/MiscIntrinsic.cpp
rename to flang/unittests/Runtime/MiscIntrinsic.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Namelist.cpp b/flang/unittests/Runtime/Namelist.cpp
similarity index 73%
rename from flang-rt/unittests/FortranRuntime/Namelist.cpp
rename to flang/unittests/Runtime/Namelist.cpp
index 2e3c5c2919297f..5911d67f0d5fc4 100644
--- a/flang-rt/unittests/FortranRuntime/Namelist.cpp
+++ b/flang/unittests/Runtime/Namelist.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/../../runtime/namelist.h"
+#include "../../runtime/namelist.h"
 #include "CrashHandlerFixture.h"
+#include "tools.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/io-api.h"
-#include "tools.h"
 #include <algorithm>
 #include <cinttypes>
 #include <complex>
@@ -27,7 +27,7 @@ struct NamelistTests : CrashHandlerFixture {};
 
 static void ClearDescriptorStorage(const Descriptor &descriptor) {
   std::memset(descriptor.raw().base_addr, 0,
-              descriptor.Elements() * descriptor.ElementBytes());
+      descriptor.Elements() * descriptor.ElementBytes());
 }
 
 TEST(NamelistTests, BasicSanity) {
@@ -38,20 +38,17 @@ TEST(NamelistTests, BasicSanity) {
   Descriptor &internalDesc{statDescs[0].descriptor()};
   SubscriptValue extent[]{numLines};
   internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/lineLength,
-                         &buffer, 1, extent, CFI_attribute_pointer);
+      &buffer, 1, extent, CFI_attribute_pointer);
   // Set up data arrays
   std::vector<int> ints;
   for (int j{0}; j < 20; ++j) {
     ints.push_back(j % 2 == 0 ? (1 << j) : -(1 << j));
   }
-  std::vector<double> reals{0.0,
-                            -0.0,
-                            std::numeric_limits<double>::infinity(),
-                            -std::numeric_limits<double>::infinity(),
-                            std::numeric_limits<double>::quiet_NaN(),
-                            std::numeric_limits<double>::max(),
-                            std::numeric_limits<double>::lowest(),
-                            std::numeric_limits<double>::epsilon()};
+  std::vector<double> reals{0.0, -0.0, std::numeric_limits<double>::infinity(),
+      -std::numeric_limits<double>::infinity(),
+      std::numeric_limits<double>::quiet_NaN(),
+      std::numeric_limits<double>::max(), std::numeric_limits<double>::lowest(),
+      std::numeric_limits<double>::epsilon()};
   std::vector<std::uint8_t> logicals;
   logicals.push_back(false);
   logicals.push_back(true);
@@ -79,14 +76,12 @@ TEST(NamelistTests, BasicSanity) {
   // Create a NAMELIST group
   static constexpr int items{5};
   const NamelistGroup::Item itemArray[items]{{"ints", *intDesc},
-                                             {"reals", *realDesc},
-                                             {"logicals", *logicalDesc},
-                                             {"complexes", *complexDesc},
-                                             {"characters", *characterDesc}};
+      {"reals", *realDesc}, {"logicals", *logicalDesc},
+      {"complexes", *complexDesc}, {"characters", *characterDesc}};
   const NamelistGroup group{"group1", items, itemArray};
   // Do an internal NAMELIST write and check results
-  auto outCookie1{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                       __FILE__, __LINE__)};
+  auto outCookie1{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(SetDelim)(outCookie1, "APOSTROPHE", 10));
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie1, group));
   auto outStatus1{IONAME(EndIoStatement)(outCookie1)};
@@ -114,14 +109,14 @@ TEST(NamelistTests, BasicSanity) {
   ClearDescriptorStorage(*logicalDesc);
   ClearDescriptorStorage(*complexDesc);
   ClearDescriptorStorage(*characterDesc);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   auto inStatus{IONAME(EndIoStatement)(inCookie)};
   ASSERT_EQ(inStatus, 0) << "Failed namelist input sanity, status "
                          << static_cast<int>(inStatus);
-  auto outCookie2{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                       __FILE__, __LINE__)};
+  auto outCookie2{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(SetDelim)(outCookie2, "APOSTROPHE", 10));
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie2, group));
   auto outStatus2{IONAME(EndIoStatement)(outCookie2)};
@@ -144,19 +139,18 @@ TEST(NamelistTests, Subscripts) {
   StaticDescriptor<1, true> statDesc;
   Descriptor &internalDesc{statDesc.descriptor()};
   internalDesc.Establish(TypeCode{CFI_type_char},
-                         /*elementBytes=*/std::strlen(t1), t1, 0, nullptr,
-                         CFI_attribute_pointer);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+      /*elementBytes=*/std::strlen(t1), t1, 0, nullptr, CFI_attribute_pointer);
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   auto inStatus{IONAME(EndIoStatement)(inCookie)};
   ASSERT_EQ(inStatus, 0) << "Failed namelist input subscripts, status "
                          << static_cast<int>(inStatus);
   char out[40];
   internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/sizeof out,
-                         out, 0, nullptr, CFI_attribute_pointer);
-  auto outCookie{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                      __FILE__, __LINE__)};
+      out, 0, nullptr, CFI_attribute_pointer);
+  auto outCookie{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie, group));
   auto outStatus{IONAME(EndIoStatement)(outCookie)};
   ASSERT_EQ(outStatus, 0)
@@ -183,8 +177,8 @@ TEST(NamelistTests, ShortArrayInput) {
   Descriptor &internalDesc{statDesc.descriptor()};
   SubscriptValue shape{2};
   internalDesc.Establish(1, 12, t1, 1, &shape, CFI_attribute_pointer);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   auto inStatus{IONAME(EndIoStatement)(inCookie)};
   ASSERT_EQ(inStatus, 0) << "Failed namelist input subscripts, status "
@@ -204,18 +198,17 @@ TEST(NamelistTests, ScalarSubstring) {
   StaticDescriptor<1, true> statDesc;
   Descriptor &internalDesc{statDesc.descriptor()};
   internalDesc.Establish(TypeCode{CFI_type_char},
-                         /*elementBytes=*/std::strlen(t1), t1, 0, nullptr,
-                         CFI_attribute_pointer);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+      /*elementBytes=*/std::strlen(t1), t1, 0, nullptr, CFI_attribute_pointer);
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(inCookie), IostatOk)
       << "namelist scalar substring input";
   char out[32];
   internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/sizeof out,
-                         out, 0, nullptr, CFI_attribute_pointer);
-  auto outCookie{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                      __FILE__, __LINE__)};
+      out, 0, nullptr, CFI_attribute_pointer);
+  auto outCookie{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(SetDelim)(outCookie, "apostrophe", 10));
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(outCookie), IostatOk) << "namelist output";
@@ -225,27 +218,26 @@ TEST(NamelistTests, ScalarSubstring) {
 }
 
 TEST(NamelistTests, ArraySubstring) {
-  OwningPtr<Descriptor> scDesc{MakeArray<TypeCategory::Character, 1>(
-      std::vector<int>{2}, std::vector<std::string>{"abcdefgh", "ijklmnop"},
-      8)};
+  OwningPtr<Descriptor> scDesc{
+      MakeArray<TypeCategory::Character, 1>(std::vector<int>{2},
+          std::vector<std::string>{"abcdefgh", "ijklmnop"}, 8)};
   const NamelistGroup::Item items[]{{"a", *scDesc}};
   const NamelistGroup group{"justa", 1, items};
   static char t1[]{"&justa A(:)(2:+5)='BCDE' 'JKLM'/"};
   StaticDescriptor<1, true> statDesc;
   Descriptor &internalDesc{statDesc.descriptor()};
   internalDesc.Establish(TypeCode{CFI_type_char},
-                         /*elementBytes=*/std::strlen(t1), t1, 0, nullptr,
-                         CFI_attribute_pointer);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+      /*elementBytes=*/std::strlen(t1), t1, 0, nullptr, CFI_attribute_pointer);
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(inCookie), IostatOk)
       << "namelist scalar substring input";
   char out[40];
   internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/sizeof out,
-                         out, 0, nullptr, CFI_attribute_pointer);
-  auto outCookie{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                      __FILE__, __LINE__)};
+      out, 0, nullptr, CFI_attribute_pointer);
+  auto outCookie{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(SetDelim)(outCookie, "apostrophe", 10));
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(outCookie), IostatOk) << "namelist output";
@@ -264,18 +256,17 @@ TEST(NamelistTests, Skip) {
   StaticDescriptor<1, true> statDesc;
   Descriptor &internalDesc{statDesc.descriptor()};
   internalDesc.Establish(TypeCode{CFI_type_char},
-                         /*elementBytes=*/std::strlen(t1), t1, 0, nullptr,
-                         CFI_attribute_pointer);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+      /*elementBytes=*/std::strlen(t1), t1, 0, nullptr, CFI_attribute_pointer);
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(inCookie), IostatOk)
       << "namelist input with skipping";
   char out[20];
   internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/sizeof out,
-                         out, 0, nullptr, CFI_attribute_pointer);
-  auto outCookie{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                      __FILE__, __LINE__)};
+      out, 0, nullptr, CFI_attribute_pointer);
+  auto outCookie{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(outCookie), IostatOk) << "namelist output";
   std::string got{out, sizeof out};
@@ -294,19 +285,18 @@ TEST(NamelistTests, Comma) {
   StaticDescriptor<1, true> statDesc;
   Descriptor &internalDesc{statDesc.descriptor()};
   internalDesc.Establish(TypeCode{CFI_type_char},
-                         /*elementBytes=*/std::strlen(t1), t1, 0, nullptr,
-                         CFI_attribute_pointer);
-  auto inCookie{IONAME(BeginInternalArrayListInput)(internalDesc, nullptr, 0,
-                                                    __FILE__, __LINE__)};
+      /*elementBytes=*/std::strlen(t1), t1, 0, nullptr, CFI_attribute_pointer);
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(SetDecimal)(inCookie, "COMMA", 5));
   ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(inCookie), IostatOk)
       << "namelist input with skipping";
   char out[30];
   internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/sizeof out,
-                         out, 0, nullptr, CFI_attribute_pointer);
-  auto outCookie{IONAME(BeginInternalArrayListOutput)(internalDesc, nullptr, 0,
-                                                      __FILE__, __LINE__)};
+      out, 0, nullptr, CFI_attribute_pointer);
+  auto outCookie{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
   ASSERT_TRUE(IONAME(SetDecimal)(outCookie, "COMMA", 5));
   ASSERT_TRUE(IONAME(OutputNamelist)(outCookie, group));
   ASSERT_EQ(IONAME(EndIoStatement)(outCookie), IostatOk) << "namelist output";
diff --git a/flang-rt/unittests/FortranRuntime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Numeric.cpp
rename to flang/unittests/Runtime/Numeric.cpp
diff --git a/flang-rt/unittests/FortranRuntime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/NumericalFormatTest.cpp
rename to flang/unittests/Runtime/NumericalFormatTest.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Pointer.cpp b/flang/unittests/Runtime/Pointer.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Pointer.cpp
rename to flang/unittests/Runtime/Pointer.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Ragged.cpp b/flang/unittests/Runtime/Ragged.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Ragged.cpp
rename to flang/unittests/Runtime/Ragged.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Random.cpp b/flang/unittests/Runtime/Random.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Random.cpp
rename to flang/unittests/Runtime/Random.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Reduction.cpp b/flang/unittests/Runtime/Reduction.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Reduction.cpp
rename to flang/unittests/Runtime/Reduction.cpp
diff --git a/flang-rt/unittests/FortranRuntime/RuntimeCrashTest.cpp b/flang/unittests/Runtime/RuntimeCrashTest.cpp
similarity index 66%
rename from flang-rt/unittests/FortranRuntime/RuntimeCrashTest.cpp
rename to flang/unittests/Runtime/RuntimeCrashTest.cpp
index 580ffe69f24480..34a8ff6d371131 100644
--- a/flang-rt/unittests/FortranRuntime/RuntimeCrashTest.cpp
+++ b/flang/unittests/Runtime/RuntimeCrashTest.cpp
@@ -11,10 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "CrashHandlerFixture.h"
-#include "flang/../../runtime/terminator.h"
+#include "tools.h"
+#include "../../runtime/terminator.h"
 #include "flang/Runtime/io-api.h"
 #include "flang/Runtime/transformational.h"
-#include "tools.h"
 #include <gtest/gtest.h>
 
 using namespace Fortran::runtime;
@@ -26,7 +26,7 @@ using Fortran::common::TypeCategory;
 //------------------------------------------------------------------------------
 struct TestTerminator : CrashHandlerFixture {};
 
-#define TEST_CRASH_HANDLER_MESSAGE                                             \
+#define TEST_CRASH_HANDLER_MESSAGE \
   "Intentionally crashing runtime for unit test"
 
 TEST(TestTerminator, CrashTest) {
@@ -39,13 +39,13 @@ TEST(TestTerminator, CrashTest) {
 TEST(TestTerminator, CheckFailedLocationTest) {
   static Fortran::runtime::Terminator t;
   ASSERT_DEATH(t.CheckFailed("predicate", "someFileName", 789),
-               "RUNTIME_CHECK\\(predicate\\) failed at someFileName\\(789\\)");
+      "RUNTIME_CHECK\\(predicate\\) failed at someFileName\\(789\\)");
 }
 
 TEST(TestTerminator, CheckFailedTest) {
   static Fortran::runtime::Terminator t;
   ASSERT_DEATH(t.CheckFailed("predicate"),
-               "RUNTIME_CHECK\\(predicate\\) failed at \\(null\\)\\(0\\)");
+      "RUNTIME_CHECK\\(predicate\\) failed at \\(null\\)\\(0\\)");
 }
 
 //------------------------------------------------------------------------------
@@ -57,10 +57,9 @@ TEST(TestIOCrash, FormatDescriptorWriteMismatchTest) {
   static constexpr int bufferSize{4};
   static char buffer[bufferSize];
   static const char *format{"(A4)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
-  ASSERT_DEATH(
-      IONAME(OutputLogical)(cookie, true),
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
+  ASSERT_DEATH(IONAME(OutputLogical)(cookie, true),
       "Data edit descriptor 'A' may not be used with a LOGICAL data item");
 }
 
@@ -68,10 +67,10 @@ TEST(TestIOCrash, InvalidFormatCharacterTest) {
   static constexpr int bufferSize{1};
   static char buffer[bufferSize];
   static const char *format{"(C1)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   ASSERT_DEATH(IONAME(OutputInteger64)(cookie, 0xfeedface),
-               "Unknown 'C' edit descriptor in FORMAT");
+      "Unknown 'C' edit descriptor in FORMAT");
 }
 
 //------------------------------------------------------------------------------
@@ -84,80 +83,80 @@ TEST(TestIOCrash, OverwriteBufferAsciiTest) {
   static constexpr int bufferSize{4};
   static char buffer[bufferSize];
   static const char *format{"(A4)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   IONAME(OutputAscii)(cookie, "four", bufferSize);
   ASSERT_DEATH(IONAME(OutputAscii)(cookie, "Too many characters!", 20),
-               "Internal write overran available records");
+      "Internal write overran available records");
 }
 
 TEST(TestIOCrash, OverwriteBufferCharacterTest) {
   static constexpr int bufferSize{1};
   static char buffer[bufferSize];
   static const char *format{"(A1)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   IONAME(OutputCharacter)(cookie, "a", 1);
   ASSERT_DEATH(IONAME(OutputCharacter)(cookie, "a", 1),
-               "Internal write overran available records");
+      "Internal write overran available records");
 }
 
 TEST(TestIOCrash, OverwriteBufferLogicalTest) {
   static constexpr int bufferSize{1};
   static char buffer[bufferSize];
   static const char *format{"(L1)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   IONAME(OutputLogical)(cookie, true);
   ASSERT_DEATH(IONAME(OutputLogical)(cookie, true),
-               "Internal write overran available records");
+      "Internal write overran available records");
 }
 
 TEST(TestIOCrash, OverwriteBufferRealTest) {
   static constexpr int bufferSize{1};
   static char buffer[bufferSize];
   static const char *format{"(F1)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   IONAME(OutputReal32)(cookie, 1.);
   EXPECT_DEATH(IONAME(OutputReal32)(cookie, 1.),
-               "Internal write overran available records");
+      "Internal write overran available records");
 
   std::memset(buffer, '\0', bufferSize);
-  cookie = IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                std::strlen(format));
+  cookie = IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format));
   IONAME(OutputReal64)(cookie, 1.);
   EXPECT_DEATH(IONAME(OutputReal64)(cookie, 1.),
-               "Internal write overran available records");
+      "Internal write overran available records");
 }
 
 TEST(TestIOCrash, OverwriteBufferComplexTest) {
   static constexpr int bufferSize{8};
   static char buffer[bufferSize];
   static const char *format{"(Z1,Z1)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   IONAME(OutputComplex32)(cookie, 1., 1.);
   EXPECT_DEATH(IONAME(OutputComplex32)(cookie, 1., 1.),
-               "Internal write overran available records");
+      "Internal write overran available records");
 
   std::memset(buffer, '\0', bufferSize);
-  cookie = IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                std::strlen(format));
+  cookie = IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format));
   IONAME(OutputComplex64)(cookie, 1., 1.);
   EXPECT_DEATH(IONAME(OutputComplex64)(cookie, 1., 1.),
-               "Internal write overran available records");
+      "Internal write overran available records");
 }
 
 TEST(TestIOCrash, OverwriteBufferIntegerTest) {
   static constexpr int bufferSize{1};
   static char buffer[bufferSize];
   static const char *format{"(I1)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(buffer, bufferSize, format,
-                                                    std::strlen(format))};
+  auto *cookie{IONAME(BeginInternalFormattedOutput)(
+      buffer, bufferSize, format, std::strlen(format))};
   IONAME(OutputInteger64)(cookie, 0xdeadbeef);
   ASSERT_DEATH(IONAME(OutputInteger64)(cookie, 0xdeadbeef),
-               "Internal write overran available records");
+      "Internal write overran available records");
 }
 
 //------------------------------------------------------------------------------
@@ -169,15 +168,13 @@ TEST(TestIntrinsicCrash, ConformityErrors) {
   // ARRAY(2,3) and MASK(2,4) should trigger a runtime error.
   auto array{MakeArray<TypeCategory::Integer, 4>(
       std::vector<int>{2, 3}, std::vector<std::int32_t>{1, 2, 3, 4, 5, 6})};
-  auto mask{MakeArray<TypeCategory::Logical, 1>(
-      std::vector<int>{2, 4},
-      std::vector<std::uint8_t>{false, true, true, false, false, true, true,
-                                true})};
+  auto mask{MakeArray<TypeCategory::Logical, 1>(std::vector<int>{2, 4},
+      std::vector<std::uint8_t>{
+          false, true, true, false, false, true, true, true})};
   StaticDescriptor<1, true> statDesc;
   Descriptor &result{statDesc.descriptor()};
 
-  ASSERT_DEATH(
-      RTNAME(Pack)(result, *array, *mask, nullptr, __FILE__, __LINE__),
+  ASSERT_DEATH(RTNAME(Pack)(result, *array, *mask, nullptr, __FILE__, __LINE__),
       "Incompatible array arguments to PACK: dimension 2 of ARRAY= has extent "
       "3 but MASK= has extent 4");
 }
diff --git a/flang-rt/unittests/FortranRuntime/Stop.cpp b/flang/unittests/Runtime/Stop.cpp
similarity index 54%
rename from flang-rt/unittests/FortranRuntime/Stop.cpp
rename to flang/unittests/Runtime/Stop.cpp
index 1c5611dd396833..b13602eaee5ea6 100644
--- a/flang-rt/unittests/FortranRuntime/Stop.cpp
+++ b/flang/unittests/Runtime/Stop.cpp
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 #include "flang/Runtime/stop.h"
 #include "CrashHandlerFixture.h"
-#include "flang/../../runtime/environment.h"
+#include "../../runtime/environment.h"
 #include <cstdlib>
 #include <gtest/gtest.h>
 
@@ -21,69 +21,67 @@ struct TestProgramEnd : CrashHandlerFixture {};
 
 TEST(TestProgramEnd, StopTest) {
   EXPECT_EXIT(RTNAME(StopStatement)(), testing::ExitedWithCode(EXIT_SUCCESS),
-              "Fortran STOP");
+      "Fortran STOP");
 }
 
 TEST(TestProgramEnd, StopTestNoStopMessage) {
   putenv(const_cast<char *>("NO_STOP_MESSAGE=1"));
-  Fortran::runtime::executionEnvironment.Configure(0, nullptr, nullptr,
-                                                   nullptr);
-  EXPECT_EXIT(RTNAME(StopStatement)(), testing::ExitedWithCode(EXIT_SUCCESS),
-              "");
+  Fortran::runtime::executionEnvironment.Configure(
+      0, nullptr, nullptr, nullptr);
+  EXPECT_EXIT(
+      RTNAME(StopStatement)(), testing::ExitedWithCode(EXIT_SUCCESS), "");
 }
 
 TEST(TestProgramEnd, StopMessageTest) {
   static const char *message{"bye bye"};
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/false, /*quiet=*/false),
-              testing::ExitedWithCode(EXIT_SUCCESS), "Fortran STOP: bye bye");
+                  /*isErrorStop=*/false, /*quiet=*/false),
+      testing::ExitedWithCode(EXIT_SUCCESS), "Fortran STOP: bye bye");
 
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/false, /*quiet=*/true),
-              testing::ExitedWithCode(EXIT_SUCCESS), "");
+                  /*isErrorStop=*/false, /*quiet=*/true),
+      testing::ExitedWithCode(EXIT_SUCCESS), "");
 
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/true, /*quiet=*/false),
-              testing::ExitedWithCode(EXIT_FAILURE),
-              "Fortran ERROR STOP: bye bye");
+                  /*isErrorStop=*/true, /*quiet=*/false),
+      testing::ExitedWithCode(EXIT_FAILURE), "Fortran ERROR STOP: bye bye");
 
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/true, /*quiet=*/true),
-              testing::ExitedWithCode(EXIT_FAILURE), "");
+                  /*isErrorStop=*/true, /*quiet=*/true),
+      testing::ExitedWithCode(EXIT_FAILURE), "");
 }
 
 TEST(TestProgramEnd, NoStopMessageTest) {
   putenv(const_cast<char *>("NO_STOP_MESSAGE=1"));
-  Fortran::runtime::executionEnvironment.Configure(0, nullptr, nullptr,
-                                                   nullptr);
+  Fortran::runtime::executionEnvironment.Configure(
+      0, nullptr, nullptr, nullptr);
   static const char *message{"bye bye"};
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/false, /*quiet=*/false),
-              testing::ExitedWithCode(EXIT_SUCCESS), "bye bye");
+                  /*isErrorStop=*/false, /*quiet=*/false),
+      testing::ExitedWithCode(EXIT_SUCCESS), "bye bye");
 
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/false, /*quiet=*/true),
-              testing::ExitedWithCode(EXIT_SUCCESS), "");
+                  /*isErrorStop=*/false, /*quiet=*/true),
+      testing::ExitedWithCode(EXIT_SUCCESS), "");
 
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/true, /*quiet=*/false),
-              testing::ExitedWithCode(EXIT_FAILURE),
-              "Fortran ERROR STOP: bye bye");
+                  /*isErrorStop=*/true, /*quiet=*/false),
+      testing::ExitedWithCode(EXIT_FAILURE), "Fortran ERROR STOP: bye bye");
 
   EXPECT_EXIT(RTNAME(StopStatementText)(message, std::strlen(message),
-                                        /*isErrorStop=*/true, /*quiet=*/true),
-              testing::ExitedWithCode(EXIT_FAILURE), "");
+                  /*isErrorStop=*/true, /*quiet=*/true),
+      testing::ExitedWithCode(EXIT_FAILURE), "");
 }
 
 TEST(TestProgramEnd, FailImageTest) {
-  EXPECT_EXIT(RTNAME(FailImageStatement)(),
-              testing::ExitedWithCode(EXIT_FAILURE), "");
+  EXPECT_EXIT(
+      RTNAME(FailImageStatement)(), testing::ExitedWithCode(EXIT_FAILURE), "");
 }
 
 TEST(TestProgramEnd, ExitTest) {
   EXPECT_EXIT(RTNAME(Exit)(), testing::ExitedWithCode(EXIT_SUCCESS), "");
-  EXPECT_EXIT(RTNAME(Exit)(EXIT_FAILURE), testing::ExitedWithCode(EXIT_FAILURE),
-              "");
+  EXPECT_EXIT(
+      RTNAME(Exit)(EXIT_FAILURE), testing::ExitedWithCode(EXIT_FAILURE), "");
 }
 
 TEST(TestProgramEnd, AbortTest) { EXPECT_DEATH(RTNAME(Abort)(), ""); }
@@ -93,8 +91,8 @@ TEST(TestProgramEnd, CrashTest) {
   static const std::string fileName{"file name"};
   static const std::string headMessage{"fatal Fortran runtime error\\("};
   static const std::string tailMessage{":343\\): "};
-  static const std::string fullMessage{headMessage + fileName + tailMessage +
-                                       crashMessage};
+  static const std::string fullMessage{
+      headMessage + fileName + tailMessage + crashMessage};
   EXPECT_DEATH(
       RTNAME(ReportFatalUserError)(crashMessage.c_str(), fileName.c_str(), 343),
       fullMessage.c_str());
diff --git a/flang-rt/unittests/FortranRuntime/TemporaryStack.cpp b/flang/unittests/Runtime/TemporaryStack.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/TemporaryStack.cpp
rename to flang/unittests/Runtime/TemporaryStack.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Time.cpp b/flang/unittests/Runtime/Time.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Time.cpp
rename to flang/unittests/Runtime/Time.cpp
diff --git a/flang-rt/unittests/FortranRuntime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/Transformational.cpp
rename to flang/unittests/Runtime/Transformational.cpp
diff --git a/flang-rt/unittests/FortranRuntime/tools.h b/flang/unittests/Runtime/tools.h
similarity index 100%
rename from flang-rt/unittests/FortranRuntime/tools.h
rename to flang/unittests/Runtime/tools.h
diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h
index 05189e993babbc..eb06cd9c08af28 100644
--- a/libc/src/__support/float_to_string.h
+++ b/libc/src/__support/float_to_string.h
@@ -115,7 +115,7 @@ namespace internal {
 
 // Returns floor(log_10(2^e)); requires 0 <= e <= 42039.
 LIBC_INLINE constexpr uint32_t log10_pow2(const uint64_t e) {
-  LIBC_ASSERT(e >= 0 && e <= 42039 &&
+  LIBC_ASSERT(e <= 42039 &&
               "Incorrect exponent to perform log10_pow2 approximation.");
   // This approximation is based on the float value for log_10(2). It first
   // gives an incorrect result for our purposes at 42039 (well beyond the 16383
@@ -708,7 +708,7 @@ FloatToString<long double>::get_negative_block(int block_index) {
     const int32_t SHIFT_CONST = TABLE_SHIFT_CONST;
 
     // if the requested block is zero
-    if (block_index <= MIN_BLOCK_2[idx]) {
+    if (block_index < MIN_BLOCK_2[idx]) {
       return 0;
     }
     const uint32_t p = POW10_OFFSET_2[idx] + block_index - MIN_BLOCK_2[idx];
diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h
index 2c8eef8a5c0b70..6d825499a1b0fd 100644
--- a/libc/src/__support/wctype_utils.h
+++ b/libc/src/__support/wctype_utils.h
@@ -28,6 +28,8 @@ namespace internal {
 LIBC_INLINE cpp::optional<int> wctob(wint_t c) {
   // This needs to be translated to EOF at the callsite. This is to avoid
   // including stdio.h in this file.
+  // The standard states that wint_t may either be an alias of wchar_t or
+  // an alias of an integer type, so we need to keep the c < 0 check.
   if (c > 127 || c < 0)
     return cpp::nullopt;
   return static_cast<int>(c);
diff --git a/libc/src/string/memory_utils/generic/builtin.h b/libc/src/string/memory_utils/generic/builtin.h
new file mode 100644
index 00000000000000..5239329f653b34
--- /dev/null
+++ b/libc/src/string/memory_utils/generic/builtin.h
@@ -0,0 +1,41 @@
+//===-- Trivial builtin implementations  ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_BUILTIN_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_BUILTIN_H
+
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
+#include "src/string/memory_utils/utils.h"   // Ptr, CPtr
+
+#include <stddef.h> // size_t
+
+namespace LIBC_NAMESPACE {
+
+static_assert(LIBC_HAS_BUILTIN(__builtin_memcpy), "Builtin not defined");
+static_assert(LIBC_HAS_BUILTIN(__builtin_memset), "Builtin not defined");
+static_assert(LIBC_HAS_BUILTIN(__builtin_memmove), "Builtin not defined");
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_builtin(Ptr dst, CPtr src, size_t count, size_t offset = 0) {
+  __builtin_memcpy(dst + offset, src + offset, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void inline_memmove_builtin(Ptr dst, CPtr src,
+                                                         size_t count) {
+  __builtin_memmove(dst, src, count);
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_builtin(Ptr dst, uint8_t value, size_t count, size_t offset = 0) {
+  __builtin_memset(dst + offset, value, count);
+}
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_BUILTIN_H
diff --git a/libc/src/string/memory_utils/inline_memcpy.h b/libc/src/string/memory_utils/inline_memcpy.h
index 0b8a7848da87b4..a92bf4ddf881d5 100644
--- a/libc/src/string/memory_utils/inline_memcpy.h
+++ b/libc/src/string/memory_utils/inline_memcpy.h
@@ -28,9 +28,12 @@
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "src/string/memory_utils/riscv/inline_memcpy.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_riscv
-#elif defined(LIBC_TARGET_ARCH_IS_ARM) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
 #include "src/string/memory_utils/generic/byte_per_byte.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_byte_per_byte
+#elif defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "src/string/memory_utils/generic/builtin.h"
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY inline_memcpy_builtin
 #else
 #error "Unsupported architecture"
 #endif
diff --git a/libc/src/string/memory_utils/inline_memmove.h b/libc/src/string/memory_utils/inline_memmove.h
index 0d31e10eaff28e..f72ea24ab538d6 100644
--- a/libc/src/string/memory_utils/inline_memmove.h
+++ b/libc/src/string/memory_utils/inline_memmove.h
@@ -20,9 +20,12 @@
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "src/string/memory_utils/riscv/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_riscv
-#elif defined(LIBC_TARGET_ARCH_IS_ARM) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
 #include "src/string/memory_utils/generic/byte_per_byte.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_byte_per_byte
+#elif defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "src/string/memory_utils/generic/builtin.h"
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_builtin
 #else
 #error "Unsupported architecture"
 #endif
diff --git a/libc/src/string/memory_utils/inline_memset.h b/libc/src/string/memory_utils/inline_memset.h
index f20ae45fa753b4..1c07c1ca4bffc0 100644
--- a/libc/src/string/memory_utils/inline_memset.h
+++ b/libc/src/string/memory_utils/inline_memset.h
@@ -24,9 +24,12 @@
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "src/string/memory_utils/riscv/inline_memset.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_riscv
-#elif defined(LIBC_TARGET_ARCH_IS_ARM) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
 #include "src/string/memory_utils/generic/byte_per_byte.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_byte_per_byte
+#elif defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "src/string/memory_utils/generic/builtin.h"
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_builtin
 #else
 #error "Unsupported architecture"
 #endif
diff --git a/libc/startup/linux/x86_64/CMakeLists.txt b/libc/startup/linux/x86_64/CMakeLists.txt
index 40f01836c13134..076c0c3e444f56 100644
--- a/libc/startup/linux/x86_64/CMakeLists.txt
+++ b/libc/startup/linux/x86_64/CMakeLists.txt
@@ -10,6 +10,7 @@ add_startup_object(
     libc.src.__support.threads.thread
     libc.src.__support.OSUtil.osutil
     libc.src.stdlib.exit
+    libc.src.stdlib.abort
     libc.src.stdlib.atexit
     libc.src.string.memory_utils.inline_memcpy
     libc.src.unistd.environ
diff --git a/libc/startup/linux/x86_64/start.cpp b/libc/startup/linux/x86_64/start.cpp
index 4b46a4f4a7b081..af95d2702ded97 100644
--- a/libc/startup/linux/x86_64/start.cpp
+++ b/libc/startup/linux/x86_64/start.cpp
@@ -7,8 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "config/linux/app.h"
+#include "src/__support/OSUtil/io.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/threads/thread.h"
+#include "src/stdlib/abort.h"
 #include "src/stdlib/atexit.h"
 #include "src/stdlib/exit.h"
 #include "src/string/memory_utils/inline_memcpy.h"
@@ -23,6 +25,11 @@
 
 extern "C" int main(int, char **, char **);
 
+extern "C" void __stack_chk_fail() {
+  LIBC_NAMESPACE::write_to_stderr("stack smashing detected");
+  LIBC_NAMESPACE::abort();
+}
+
 namespace LIBC_NAMESPACE {
 
 #ifdef SYS_mmap2
@@ -54,7 +61,9 @@ void init_tls(TLSDescriptor &tls_descriptor) {
   // Per the x86_64 TLS ABI, the entry pointed to by the thread pointer is the
   // address of the TLS block. So, we add more size to accomodate this address
   // entry.
-  uintptr_t tlsSizeWithAddr = tlsSize + sizeof(uintptr_t);
+  // We also need to include space for the stack canary. The canary is at
+  // offset 0x28 (40) and is of size uintptr_t.
+  uintptr_t tlsSizeWithAddr = tlsSize + sizeof(uintptr_t) + 40;
 
   // We cannot call the mmap function here as the functions set errno on
   // failure. Since errno is implemented via a thread local variable, we cannot
@@ -76,6 +85,16 @@ void init_tls(TLSDescriptor &tls_descriptor) {
   LIBC_NAMESPACE::inline_memcpy(reinterpret_cast<char *>(tlsAddr),
                                 reinterpret_cast<const char *>(app.tls.address),
                                 app.tls.init_size);
+  uintptr_t *stackGuardAddr = reinterpret_cast<uintptr_t *>(endPtr + 40);
+  // Setting the stack guard to a random value.
+  // We cannot call the get_random function here as the function sets errno on
+  // failure. Since errno is implemented via a thread local variable, we cannot
+  // use errno before TLS is setup.
+  ssize_t stackGuardRetVal = LIBC_NAMESPACE::syscall_impl<ssize_t>(
+      SYS_getrandom, reinterpret_cast<long>(stackGuardAddr), sizeof(uint64_t),
+      0);
+  if (stackGuardRetVal < 0)
+    LIBC_NAMESPACE::syscall_impl(SYS_exit, 1);
 
   tls_descriptor = {tlsSizeWithAddr, uintptr_t(tlsAddr), endPtr};
   return;
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 0c1f340449236e..fb60916a0402f0 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -156,17 +156,21 @@ template <TestCond C, typename T> FPMatcher<T, C> getMatcher(T expectedValue) {
   do {                                                                         \
     using namespace LIBC_NAMESPACE::fputil::testing;                           \
     ForceRoundingMode __r1(RoundingMode::Nearest);                             \
-    if (__r1.success)                                                          \
+    if (__r1.success) {                                                        \
       EXPECT_FP_EQ((expected), (actual));                                      \
+    }                                                                          \
     ForceRoundingMode __r2(RoundingMode::Upward);                              \
-    if (__r2.success)                                                          \
+    if (__r2.success) {                                                        \
       EXPECT_FP_EQ((expected), (actual));                                      \
+    }                                                                          \
     ForceRoundingMode __r3(RoundingMode::Downward);                            \
-    if (__r3.success)                                                          \
+    if (__r3.success) {                                                        \
       EXPECT_FP_EQ((expected), (actual));                                      \
+    }                                                                          \
     ForceRoundingMode __r4(RoundingMode::TowardZero);                          \
-    if (__r4.success)                                                          \
+    if (__r4.success) {                                                        \
       EXPECT_FP_EQ((expected), (actual));                                      \
+    }                                                                          \
   } while (0)
 
 #endif // LLVM_LIBC_UTILS_UNITTEST_FPMATCHER_H
diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp
index 2f18d5778e71d8..b01f40f0526461 100644
--- a/libc/test/UnitTest/LibcTest.cpp
+++ b/libc/test/UnitTest/LibcTest.cpp
@@ -95,6 +95,7 @@ bool test(RunContext *Ctx, TestCond Cond, ValType LHS, ValType RHS,
   case TestCond::GE:
     return ExplainDifference(LHS >= RHS, "greater than or equal to");
   }
+  __builtin_unreachable();
 }
 
 } // namespace internal
diff --git a/libc/test/UnitTest/LibcTest.h b/libc/test/UnitTest/LibcTest.h
index dc17c3c49d4277..68fd5f28f3e0bd 100644
--- a/libc/test/UnitTest/LibcTest.h
+++ b/libc/test/UnitTest/LibcTest.h
@@ -378,19 +378,6 @@ CString libc_make_test_file_path_func(const char *file_name);
   SuiteClass##_##TestName SuiteClass##_##TestName##_Instance;                  \
   void SuiteClass##_##TestName::Run()
 
-// The GNU compiler emits a warning if nested "if" statements are followed by
-// an "else" statement and braces are not used to explicitly disambiguate the
-// "else" binding.  This leads to problems with code like:
-//
-//   if (gate)
-//     ASSERT_*(condition) << "Some message";
-//
-// The "switch (0) case 0:" idiom is used to suppress this.
-#define LIBC_AMBIGUOUS_ELSE_BLOCKER_                                           \
-  switch (0)                                                                   \
-  case 0:                                                                      \
-  default:
-
 // If RET_OR_EMPTY is the 'return' keyword we perform an early return which
 // corresponds to an assert. If it is empty the execution continues, this
 // corresponds to an expect.
@@ -402,7 +389,6 @@ CString libc_make_test_file_path_func(const char *file_name);
 // returning a boolean. This expression is responsible for logging the
 // diagnostic in case of failure.
 #define LIBC_TEST_SCAFFOLDING_(TEST, RET_OR_EMPTY)                             \
-  LIBC_AMBIGUOUS_ELSE_BLOCKER_                                                 \
   if (TEST)                                                                    \
     ;                                                                          \
   else                                                                         \
diff --git a/libc/test/integration/src/unistd/CMakeLists.txt b/libc/test/integration/src/unistd/CMakeLists.txt
index 72fdb8fc7e6b01..10aac212af355e 100644
--- a/libc/test/integration/src/unistd/CMakeLists.txt
+++ b/libc/test/integration/src/unistd/CMakeLists.txt
@@ -33,6 +33,29 @@ add_integration_test(
     libc.src.unistd.fork
 )
 
+if((${LIBC_TARGET_OS} STREQUAL "linux") AND (${LIBC_TARGET_ARCHITECTURE_IS_X86}))
+  add_integration_test(
+    stack_smashing_test
+    SUITE
+      unistd-integration-tests
+    SRCS
+      stack_smashing_test.cpp
+    DEPENDS
+      libc.include.errno
+      libc.include.signal
+      libc.include.sys_wait
+      libc.include.unistd
+      libc.src.pthread.pthread_atfork
+      libc.src.signal.raise
+      libc.src.sys.wait.wait
+      libc.src.sys.wait.wait4
+      libc.src.sys.wait.waitpid
+      libc.src.unistd.fork
+    COMPILE_OPTIONS
+      -fstack-protector-all
+  )
+endif()
+
 add_executable(
   libc_execv_test_normal_exit
   EXCLUDE_FROM_ALL
diff --git a/libc/test/integration/src/unistd/stack_smashing_test.cpp b/libc/test/integration/src/unistd/stack_smashing_test.cpp
new file mode 100644
index 00000000000000..89fc53dac506ac
--- /dev/null
+++ b/libc/test/integration/src/unistd/stack_smashing_test.cpp
@@ -0,0 +1,68 @@
+//===--- Stack smashing test to check stack canary set up  ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/string.h"
+#include "src/__support/OSUtil/io.h"
+#include "src/pthread/pthread_atfork.h"
+#include "src/signal/raise.h"
+#include "src/sys/wait/wait.h"
+#include "src/sys/wait/wait4.h"
+#include "src/sys/wait/waitpid.h"
+#include "src/unistd/fork.h"
+
+#include "test/IntegrationTest/test.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+void no_stack_smashing_normal_exit() {
+  pid_t pid = LIBC_NAMESPACE::fork();
+  if (pid == 0) {
+    // Child process
+    char foo[30];
+    for (int i = 0; i < 30; i++)
+      foo[i] = (foo[i] != 42) ? 42 : 24;
+    return;
+  }
+  ASSERT_TRUE(pid > 0);
+  int status;
+  pid_t cpid = LIBC_NAMESPACE::wait(&status);
+  ASSERT_TRUE(cpid > 0);
+  ASSERT_EQ(cpid, pid);
+  ASSERT_TRUE(WIFEXITED(status));
+}
+
+void stack_smashing_abort() {
+  pid_t pid = LIBC_NAMESPACE::fork();
+  if (pid == 0) {
+    // Child process
+    char foo[30];
+    char *frame_ptr = static_cast<char *>(__builtin_frame_address(0));
+    char *cur_ptr = &foo[0];
+    // Corrupt the stack
+    while (cur_ptr != frame_ptr) {
+      *cur_ptr = (*cur_ptr != 42) ? 42 : 24;
+      cur_ptr++;
+    }
+    return;
+  }
+  ASSERT_TRUE(pid > 0);
+  int status;
+  pid_t cpid = LIBC_NAMESPACE::wait(&status);
+  ASSERT_TRUE(cpid > 0);
+  ASSERT_EQ(cpid, pid);
+  ASSERT_TRUE(WTERMSIG(status) == SIGABRT);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  no_stack_smashing_normal_exit();
+  stack_smashing_abort();
+  return 0;
+}
diff --git a/libc/test/src/__support/CPP/bitset_test.cpp b/libc/test/src/__support/CPP/bitset_test.cpp
index f1d4e4c2f96364..3632e54907ceb4 100644
--- a/libc/test/src/__support/CPP/bitset_test.cpp
+++ b/libc/test/src/__support/CPP/bitset_test.cpp
@@ -194,7 +194,7 @@ TEST(LlvmLibcBitsetTest, SetRangeTest) {
   // Check setting exactly one unit
   bitset.set_range(0, 63);
   for (size_t j = 0; j < 256; ++j)
-    EXPECT_EQ(bitset.test(j), (j >= 0 && j <= 63));
+    EXPECT_EQ(bitset.test(j), j <= 63);
   bitset.reset();
 
   // Check ranges across unit boundaries work.
diff --git a/libc/test/src/__support/CPP/type_traits_test.cpp b/libc/test/src/__support/CPP/type_traits_test.cpp
index 0e5f4a1f738b7a..a2051f38027532 100644
--- a/libc/test/src/__support/CPP/type_traits_test.cpp
+++ b/libc/test/src/__support/CPP/type_traits_test.cpp
@@ -157,7 +157,7 @@ struct A {
 
 struct B : public A {
   virtual ~B() {}
-  virtual void apply() { state = B_APPLY_CALLED; }
+  virtual void apply() override { state = B_APPLY_CALLED; }
 };
 
 void free_function() {}
diff --git a/libc/test/src/__support/FPUtil/rounding_mode_test.cpp b/libc/test/src/__support/FPUtil/rounding_mode_test.cpp
index e73d10d9658841..8077a5aab7afde 100644
--- a/libc/test/src/__support/FPUtil/rounding_mode_test.cpp
+++ b/libc/test/src/__support/FPUtil/rounding_mode_test.cpp
@@ -19,23 +19,27 @@ TEST(LlvmLibcFEnvImplTest, QuickRoundingUpTest) {
   using LIBC_NAMESPACE::fputil::fenv_is_round_up;
   {
     ForceRoundingMode __r(RoundingMode::Upward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_TRUE(fenv_is_round_up());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Downward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_up());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Nearest);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_up());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::TowardZero);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_up());
+    }
   }
 }
 
@@ -43,23 +47,27 @@ TEST(LlvmLibcFEnvImplTest, QuickRoundingDownTest) {
   using LIBC_NAMESPACE::fputil::fenv_is_round_down;
   {
     ForceRoundingMode __r(RoundingMode::Upward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_down());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Downward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_TRUE(fenv_is_round_down());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Nearest);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_down());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::TowardZero);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_down());
+    }
   }
 }
 
@@ -67,23 +75,27 @@ TEST(LlvmLibcFEnvImplTest, QuickRoundingNearestTest) {
   using LIBC_NAMESPACE::fputil::fenv_is_round_to_nearest;
   {
     ForceRoundingMode __r(RoundingMode::Upward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_to_nearest());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Downward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_to_nearest());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Nearest);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_TRUE(fenv_is_round_to_nearest());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::TowardZero);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_to_nearest());
+    }
   }
 }
 
@@ -91,23 +103,27 @@ TEST(LlvmLibcFEnvImplTest, QuickRoundingTowardZeroTest) {
   using LIBC_NAMESPACE::fputil::fenv_is_round_to_zero;
   {
     ForceRoundingMode __r(RoundingMode::Upward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_to_zero());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Downward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_to_zero());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Nearest);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_FALSE(fenv_is_round_to_zero());
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::TowardZero);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_TRUE(fenv_is_round_to_zero());
+    }
   }
 }
 
@@ -115,22 +131,26 @@ TEST(LlvmLibcFEnvImplTest, QuickGetRoundTest) {
   using LIBC_NAMESPACE::fputil::quick_get_round;
   {
     ForceRoundingMode __r(RoundingMode::Upward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_EQ(quick_get_round(), FE_UPWARD);
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Downward);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_EQ(quick_get_round(), FE_DOWNWARD);
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::Nearest);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_EQ(quick_get_round(), FE_TONEAREST);
+    }
   }
   {
     ForceRoundingMode __r(RoundingMode::TowardZero);
-    if (__r.success)
+    if (__r.success) {
       ASSERT_EQ(quick_get_round(), FE_TOWARDZERO);
+    }
   }
 }
diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h
index 1740770fd6c478..3ff169091cc9a3 100644
--- a/libc/test/src/math/smoke/FrexpTest.h
+++ b/libc/test/src/math/smoke/FrexpTest.h
@@ -93,4 +93,4 @@ template <typename T> class FrexpTest : public LIBC_NAMESPACE::testing::Test {
   using LlvmLibcFrexpTest = FrexpTest<T>;                                      \
   TEST_F(LlvmLibcFrexpTest, SpecialNumbers) { testSpecialNumbers(&func); }     \
   TEST_F(LlvmLibcFrexpTest, PowersOfTwo) { testPowersOfTwo(&func); }           \
-  TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); }\
+  TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); }
diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
index a343710829f10b..008167e07221f4 100644
--- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
+++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
@@ -40,12 +40,15 @@ TEST(LlvmLibcPosixSpawnFileActionsTest, AddActions) {
   int action_count = 0;
   while (act != nullptr) {
     ++action_count;
-    if (action_count == 1)
+    if (action_count == 1) {
       ASSERT_EQ(act->type, LIBC_NAMESPACE::BaseSpawnFileAction::CLOSE);
-    if (action_count == 2)
+    }
+    if (action_count == 2) {
       ASSERT_EQ(act->type, LIBC_NAMESPACE::BaseSpawnFileAction::DUP2);
-    if (action_count == 3)
+    }
+    if (action_count == 3) {
       ASSERT_EQ(act->type, LIBC_NAMESPACE::BaseSpawnFileAction::OPEN);
+    }
     act = act->next;
   }
   ASSERT_EQ(action_count, 3);
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index 8b9c919bed203d..b7e8b754858810 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -2433,6 +2433,9 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%g", 9999999000000.00);
   ASSERT_STREQ_LEN(written, buff, "1e+13");
 
+  written = LIBC_NAMESPACE::sprintf(buff, "%g", 0xa.aaaaaaaaaaaaaabp-7);
+  ASSERT_STREQ_LEN(written, buff, "0.0833333");
+
   // Simple Subnormal Tests.
 
   written = LIBC_NAMESPACE::sprintf(buff, "%g", 0x1.0p-1027);
@@ -2457,9 +2460,16 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
 
   // Length Modifier Tests.
 
+#if defined(SPECIAL_X86_LONG_DOUBLE)
+
   written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L);
   ASSERT_STREQ_LEN(written, buff, "1.18973e+4932");
 
+  written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L);
+  ASSERT_STREQ_LEN(written, buff, "0.0833333");
+
+#endif // SPECIAL_X86_LONG_DOUBLE
+
   // TODO: Uncomment the below tests after long double support is added
   /*
   written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L);
@@ -2757,6 +2767,15 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%.10g", 0x1.0p-1074);
   ASSERT_STREQ_LEN(written, buff, "4.940656458e-324");
 
+#if defined(SPECIAL_X86_LONG_DOUBLE)
+
+  written = LIBC_NAMESPACE::sprintf(buff, "%.60Lg", 0xa.aaaaaaaaaaaaaabp-7L);
+  ASSERT_STREQ_LEN(
+      written, buff,
+      "0.0833333333333333333355920878593448009041821933351457118988037");
+
+#endif // SPECIAL_X86_LONG_DOUBLE
+
   // Long double precision tests.
   // These are currently commented out because they require long double support
   // that isn't ready yet.
diff --git a/libc/test/src/time/TmHelper.h b/libc/test/src/time/TmHelper.h
index 19202031915df1..d8e638d8dbaffb 100644
--- a/libc/test/src/time/TmHelper.h
+++ b/libc/test/src/time/TmHelper.h
@@ -31,7 +31,8 @@ static inline void initialize_tm_data(struct tm *tm_data, int year, int month,
                     // years since 1900
                     .tm_year = year - TimeConstants::TIME_YEAR_BASE,
                     .tm_wday = wday,
-                    .tm_yday = yday};
+                    .tm_yday = yday,
+                    .tm_isdst = 0};
   *tm_data = temp;
 }
 
diff --git a/libc/test/src/time/mktime_test.cpp b/libc/test/src/time/mktime_test.cpp
index 753363ec397586..6f179150953b7f 100644
--- a/libc/test/src/time/mktime_test.cpp
+++ b/libc/test/src/time/mktime_test.cpp
@@ -27,7 +27,7 @@ TEST(LlvmLibcMkTime, FailureSetsErrno) {
   struct tm tm_data {
     .tm_sec = INT_MAX, .tm_min = INT_MAX, .tm_hour = INT_MAX,
     .tm_mday = INT_MAX, .tm_mon = INT_MAX - 1, .tm_year = tm_year(INT_MAX),
-    .tm_wday = 0, .tm_yday = 0
+    .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
   };
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
 }
@@ -38,7 +38,7 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
     struct tm tm_data {
       .tm_sec = -1, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-1));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
@@ -48,7 +48,8 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
                      .tm_mon = Month::DECEMBER,
                      .tm_year = tm_year(1969),
                      .tm_wday = 3,
-                     .tm_yday = 364}),
+                     .tm_yday = 364,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -57,7 +58,7 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
     struct tm tm_data {
       .tm_sec = 60, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(60));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
@@ -67,7 +68,8 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
                      .tm_mon = Month::JANUARY,
                      .tm_year = tm_year(1970),
                      .tm_wday = 4,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 }
@@ -78,7 +80,7 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = -1, .tm_hour = 0, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(-TimeConstants::SECONDS_PER_MIN));
@@ -89,7 +91,8 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
                      .tm_mon = Month::DECEMBER,
                      .tm_year = tm_year(1969),
                      .tm_wday = 3,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -98,7 +101,7 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 60, .tm_hour = 0, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(60 * TimeConstants::SECONDS_PER_MIN));
@@ -109,7 +112,8 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
                      .tm_mon = Month::JANUARY,
                      .tm_year = tm_year(1970),
                      .tm_wday = 4,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 }
@@ -120,7 +124,7 @@ TEST(LlvmLibcMkTime, InvalidHours) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = -1, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(-TimeConstants::SECONDS_PER_HOUR));
@@ -131,7 +135,8 @@ TEST(LlvmLibcMkTime, InvalidHours) {
                      .tm_mon = Month::DECEMBER,
                      .tm_year = tm_year(1969),
                      .tm_wday = 3,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -140,7 +145,7 @@ TEST(LlvmLibcMkTime, InvalidHours) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 24, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(24 * TimeConstants::SECONDS_PER_HOUR));
@@ -151,7 +156,8 @@ TEST(LlvmLibcMkTime, InvalidHours) {
                      .tm_mon = Month::JANUARY,
                      .tm_year = tm_year(1970),
                      .tm_wday = 5,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 }
@@ -161,7 +167,7 @@ TEST(LlvmLibcMkTime, InvalidYear) {
   struct tm tm_data {
     .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
     .tm_mon = Month::JANUARY, .tm_year = tm_year(1969), .tm_wday = 0,
-    .tm_yday = 0
+    .tm_yday = 0, .tm_isdst = 0
   };
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
               Succeeds(-TimeConstants::DAYS_PER_NON_LEAP_YEAR *
@@ -173,7 +179,8 @@ TEST(LlvmLibcMkTime, InvalidYear) {
                    .tm_mon = Month::JANUARY,
                    .tm_year = tm_year(1969),
                    .tm_wday = 3,
-                   .tm_yday = 0}),
+                   .tm_yday = 0,
+                   .tm_isdst = 0}),
                tm_data);
 }
 
@@ -185,7 +192,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     struct tm tm_data {
       .tm_sec = 8, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
@@ -195,7 +202,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     struct tm tm_data {
       .tm_sec = 7, .tm_min = 15, .tm_hour = 3, .tm_mday = 19,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
@@ -205,7 +212,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     struct tm tm_data {
       .tm_sec = 7, .tm_min = 14, .tm_hour = 4, .tm_mday = 19,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
@@ -215,7 +222,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     struct tm tm_data {
       .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 20,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
@@ -225,7 +232,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     struct tm tm_data {
       .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
       .tm_mon = Month::FEBRUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
@@ -235,7 +242,7 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     struct tm tm_data {
       .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2039), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
@@ -246,7 +253,7 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
     // -1 month from 1970-01-01 00:00:00 returns 1969-12-01 00:00:00.
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 0, .tm_mon = -1,
-      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0
+      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(-32 * TimeConstants::SECONDS_PER_DAY));
@@ -257,7 +264,8 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
                      .tm_mon = Month::DECEMBER,
                      .tm_year = tm_year(1969),
                      .tm_wday = 1,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -265,7 +273,7 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
     // 1970-13-01 00:00:00 returns 1971-01-01 00:00:00.
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 1, .tm_mon = 12,
-      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0
+      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(TimeConstants::DAYS_PER_NON_LEAP_YEAR *
@@ -277,7 +285,8 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
                      .tm_mon = Month::JANUARY,
                      .tm_year = tm_year(1971),
                      .tm_wday = 5,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 }
@@ -288,7 +297,7 @@ TEST(LlvmLibcMkTime, InvalidDays) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = (1 - 1),
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(-1 * TimeConstants::SECONDS_PER_DAY));
@@ -299,7 +308,8 @@ TEST(LlvmLibcMkTime, InvalidDays) {
                      .tm_mon = Month::DECEMBER,
                      .tm_year = tm_year(1969),
                      .tm_wday = 3,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -308,7 +318,7 @@ TEST(LlvmLibcMkTime, InvalidDays) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 32,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(31 * TimeConstants::SECONDS_PER_DAY));
@@ -319,7 +329,8 @@ TEST(LlvmLibcMkTime, InvalidDays) {
                      .tm_mon = Month::FEBRUARY,
                      .tm_year = tm_year(1970),
                      .tm_wday = 0,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -328,7 +339,7 @@ TEST(LlvmLibcMkTime, InvalidDays) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 29,
       .tm_mon = Month::FEBRUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(59 * TimeConstants::SECONDS_PER_DAY));
@@ -339,7 +350,8 @@ TEST(LlvmLibcMkTime, InvalidDays) {
                      .tm_mon = Month::MARCH,
                      .tm_year = tm_year(1970),
                      .tm_wday = 0,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -348,7 +360,7 @@ TEST(LlvmLibcMkTime, InvalidDays) {
     struct tm tm_data {
       .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 30,
       .tm_mon = Month::FEBRUARY, .tm_year = tm_year(1972), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
                 Succeeds(((2 * TimeConstants::DAYS_PER_NON_LEAP_YEAR) + 60) *
@@ -360,7 +372,8 @@ TEST(LlvmLibcMkTime, InvalidDays) {
                      .tm_mon = Month::MARCH,
                      .tm_year = tm_year(1972),
                      .tm_wday = 3,
-                     .tm_yday = 0}),
+                     .tm_yday = 0,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 }
@@ -371,7 +384,7 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   struct tm tm_data {
     .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
     .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-    .tm_yday = 0
+    .tm_yday = 0, .tm_isdst = 0
   };
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF));
   EXPECT_TM_EQ((tm{.tm_sec = 7,
@@ -381,7 +394,8 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
                    .tm_mon = Month::JANUARY,
                    .tm_year = tm_year(2038),
                    .tm_wday = 2,
-                   .tm_yday = 7}),
+                   .tm_yday = 7,
+                   .tm_isdst = 0}),
                tm_data);
 }
 
@@ -393,7 +407,7 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
     struct tm tm_data {
       .tm_sec = 50, .tm_min = 50, .tm_hour = 12, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2170), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(6311479850));
     EXPECT_TM_EQ((tm{.tm_sec = 50,
@@ -403,7 +417,8 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
                      .tm_mon = Month::JANUARY,
                      .tm_year = tm_year(2170),
                      .tm_wday = 1,
-                     .tm_yday = 50}),
+                     .tm_yday = 50,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 
@@ -412,7 +427,7 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
     struct tm tm_data {
       .tm_sec = 50, .tm_min = 50, .tm_hour = 12, .tm_mday = 1,
       .tm_mon = Month::JANUARY, .tm_year = tm_year(2147483647), .tm_wday = 0,
-      .tm_yday = 0
+      .tm_yday = 0, .tm_isdst = 0
     };
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(67767976202043050));
     EXPECT_TM_EQ((tm{.tm_sec = 50,
@@ -422,7 +437,8 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
                      .tm_mon = Month::JANUARY,
                      .tm_year = tm_year(2147483647),
                      .tm_wday = 2,
-                     .tm_yday = 50}),
+                     .tm_yday = 50,
+                     .tm_isdst = 0}),
                  tm_data);
   }
 }
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index e23a3c2034cb28..0818955f14de7d 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -72,6 +72,7 @@ static inline mpfr_rnd_t get_mpfr_rounding_mode(RoundingMode mode) {
     return MPFR_RNDN;
     break;
   }
+  __builtin_unreachable();
 }
 
 class MPFRNumber {
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index baac2d0e3a410e..b9573e89f25a4a 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -356,21 +356,25 @@ template <typename T> bool round_to_long(T x, RoundingMode mode, long &result);
   {                                                                            \
     namespace mpfr = LIBC_NAMESPACE::testing::mpfr;                            \
     mpfr::ForceRoundingMode __r1(mpfr::RoundingMode::Nearest);                 \
-    if (__r1.success)                                                          \
+    if (__r1.success) {                                                        \
       EXPECT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::Nearest);                          \
+    }                                                                          \
     mpfr::ForceRoundingMode __r2(mpfr::RoundingMode::Upward);                  \
-    if (__r2.success)                                                          \
+    if (__r2.success) {                                                        \
       EXPECT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::Upward);                           \
+    }                                                                          \
     mpfr::ForceRoundingMode __r3(mpfr::RoundingMode::Downward);                \
-    if (__r3.success)                                                          \
+    if (__r3.success) {                                                        \
       EXPECT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::Downward);                         \
+    }                                                                          \
     mpfr::ForceRoundingMode __r4(mpfr::RoundingMode::TowardZero);              \
-    if (__r4.success)                                                          \
+    if (__r4.success) {                                                        \
       EXPECT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::TowardZero);                       \
+    }                                                                          \
   }
 
 #define TEST_MPFR_MATCH_ROUNDING_SILENTLY(op, input, match_value,              \
@@ -400,21 +404,25 @@ template <typename T> bool round_to_long(T x, RoundingMode mode, long &result);
   {                                                                            \
     namespace mpfr = LIBC_NAMESPACE::testing::mpfr;                            \
     mpfr::ForceRoundingMode __r1(mpfr::RoundingMode::Nearest);                 \
-    if (__r1.success)                                                          \
+    if (__r1.success) {                                                        \
       ASSERT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::Nearest);                          \
+    }                                                                          \
     mpfr::ForceRoundingMode __r2(mpfr::RoundingMode::Upward);                  \
-    if (__r2.success)                                                          \
+    if (__r2.success) {                                                        \
       ASSERT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::Upward);                           \
+    }                                                                          \
     mpfr::ForceRoundingMode __r3(mpfr::RoundingMode::Downward);                \
-    if (__r3.success)                                                          \
+    if (__r3.success) {                                                        \
       ASSERT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::Downward);                         \
+    }                                                                          \
     mpfr::ForceRoundingMode __r4(mpfr::RoundingMode::TowardZero);              \
-    if (__r4.success)                                                          \
+    if (__r4.success) {                                                        \
       ASSERT_MPFR_MATCH(op, input, match_value, ulp_tolerance,                 \
                         mpfr::RoundingMode::TowardZero);                       \
+    }                                                                          \
   }
 
 #endif // LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H
diff --git a/libclc/utils/prepare-builtins.cpp b/libclc/utils/prepare-builtins.cpp
index 550b5971913f48..ebdbc68cfee3b3 100644
--- a/libclc/utils/prepare-builtins.cpp
+++ b/libclc/utils/prepare-builtins.cpp
@@ -5,23 +5,27 @@
 #include "llvm/Bitcode/ReaderWriter.h"
 #endif
 
+#include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/raw_ostream.h"
 
 #include <system_error>
 
 using namespace llvm;
 
+static ExitOnError ExitOnErr;
+
 static cl::opt<std::string>
 InputFilename(cl::Positional, cl::desc("<input bitcode>"), cl::init("-"));
 
@@ -29,6 +33,9 @@ static cl::opt<std::string>
 OutputFilename("o", cl::desc("Output filename"),
                cl::value_desc("filename"));
 
+static cl::opt<bool> TextualOut("S", cl::desc("Emit LLVM textual assembly"),
+                                cl::init(false));
+
 int main(int argc, char **argv) {
   LLVMContext Context;
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
@@ -45,17 +52,15 @@ int main(int argc, char **argv) {
       ErrorMessage = ec.message();
     } else {
       std::unique_ptr<MemoryBuffer> &BufferPtr = BufferOrErr.get();
-      ErrorOr<std::unique_ptr<Module>> ModuleOrErr =
+      SMDiagnostic Err;
+      std::unique_ptr<llvm::Module> MPtr =
 #if HAVE_LLVM > 0x0390
-          expectedToErrorOrAndEmitErrors(Context,
-          parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context));
+          ExitOnErr(Expected<std::unique_ptr<llvm::Module>>(
+              parseIR(BufferPtr.get()->getMemBufferRef(), Err, Context)));
 #else
-          parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context);
+          parseIR(BufferPtr.get()->getMemBufferRef(), Err, Context);
 #endif
-      if (std::error_code ec = ModuleOrErr.getError())
-        ErrorMessage = ec.message();
-
-      M = ModuleOrErr.get().release();
+      M = MPtr.release();
     }
   }
 
@@ -105,14 +110,16 @@ int main(int argc, char **argv) {
     exit(1);
   }
 
+  if (TextualOut)
+    M->print(Out->os(), nullptr, true);
+  else
 #if HAVE_LLVM >= 0x0700
-  WriteBitcodeToFile(*M, Out->os());
+    WriteBitcodeToFile(*M, Out->os());
 #else
-  WriteBitcodeToFile(M, Out->os());
+    WriteBitcodeToFile(M, Out->os());
 #endif
 
   // Declare success.
   Out->keep();
   return 0;
 }
-
diff --git a/libcxx/benchmarks/ContainerBenchmarks.h b/libcxx/benchmarks/ContainerBenchmarks.h
index 071e46c2a1c654..9a9abfd3b0d0f6 100644
--- a/libcxx/benchmarks/ContainerBenchmarks.h
+++ b/libcxx/benchmarks/ContainerBenchmarks.h
@@ -79,6 +79,19 @@ void BM_ConstructFromRange(benchmark::State& st, Container, GenInputs gen) {
   }
 }
 
+template <class Container>
+void BM_Pushback(benchmark::State& state, Container c) {
+  int count = state.range(0);
+  c.reserve(count);
+  while (state.KeepRunningBatch(count)) {
+    c.clear();
+    for (int i = 0; i != count; ++i) {
+      c.push_back(i);
+    }
+    benchmark::DoNotOptimize(c.data());
+  }
+}
+
 template <class Container, class GenInputs>
 void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
   auto in        = gen(st.range(0));
diff --git a/libcxx/benchmarks/vector_operations.bench.cpp b/libcxx/benchmarks/vector_operations.bench.cpp
index be0bee6b645612..38b14c56756fba 100644
--- a/libcxx/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/benchmarks/vector_operations.bench.cpp
@@ -39,4 +39,6 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_size_t, std::vector<size_t>{}, g
 BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector<std::string>{}, getRandomStringInputs)
     ->Arg(TestNumInputs);
 
+BENCHMARK_CAPTURE(BM_Pushback, vector_int, std::vector<int>{})->Arg(TestNumInputs);
+
 BENCHMARK_MAIN();
diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst
index 0a5e99984fa75e..138305bb875aaa 100644
--- a/libcxx/docs/ReleaseNotes/18.rst
+++ b/libcxx/docs/ReleaseNotes/18.rst
@@ -52,6 +52,7 @@ Implemented Papers
 - P2697R1 - Interfacing ``bitset`` with ``string_view``
 - P2443R1 - ``views::chunk_by``
 - P2538R1 - ADL-proof ``std::projected``
+- P2614R2 - Deprecate ``numeric_limits::has_denorm``
 
 
 Improvements and New Features
@@ -87,6 +88,9 @@ Deprecations and Removals
   warning). ``_LIBCPP_ENABLE_ASSERTIONS`` will be removed entirely in the next release and setting it will become an
   error. See :ref:`the hardening documentation <using-hardening-modes>` for more details.
 
+- The non-conforming constructor ``std::future_error(std::error_code)`` has been removed. Please use the
+  ``std::future_error(std::future_errc)`` constructor provided in C++17 instead.
+
 Upcoming Deprecations and Removals
 ----------------------------------
 
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index 9cb49fd5176ea5..35319fb7576d3c 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -117,7 +117,7 @@
 "`P2655R3 <https://wg21.link/P2655R3>`__","LWG", "``common_reference_t`` of ``reference_wrapper`` Should Be a Reference Type","February 2023","","",""
 "`P2652R2 <https://wg21.link/P2652R2>`__","LWG", "Disallow User Specialization of ``allocator_traits``","February 2023","","",""
 "`P2787R1 <https://wg21.link/P2787R1>`__","LWG", "``pmr::generator`` - Promise Types are not Values","February 2023","","",""
-"`P2614R2 <https://wg21.link/P2614R2>`__","LWG", "Deprecate ``numeric_limits::has_denorm``","February 2023","","",""
+"`P2614R2 <https://wg21.link/P2614R2>`__","LWG", "Deprecate ``numeric_limits::has_denorm``","February 2023","|Complete|","18.0",""
 "`P2588R3 <https://wg21.link/P2588R3>`__","LWG", "``barrier``’s phase completion guarantees","February 2023","","",""
 "`P2763R1 <https://wg21.link/P2763R1>`__","LWG", "``layout_stride`` static extents default constructor fix","February 2023","","",""
 "`P2736R2 <https://wg21.link/P2736R2>`__","CWG","Referencing The Unicode Standard","February 2023","","","|format|"
diff --git a/libcxx/docs/Status/RangesViews.csv b/libcxx/docs/Status/RangesViews.csv
index 371b01ae6920f6..35c5dad6dcdef1 100644
--- a/libcxx/docs/Status/RangesViews.csv
+++ b/libcxx/docs/Status/RangesViews.csv
@@ -29,7 +29,7 @@ C++23,`zip_transform <https://wg21.link/P2321R2>`_,Hui Xie,No patch yet,Not star
 C++23,`adjacent <https://wg21.link/P2321R2>`_,Hui Xie,No patch yet,Not started
 C++23,`adjacent_transform <https://wg21.link/P2321R2>`_,Hui Xie,No patch yet,Not started
 C++23,`join_with <https://wg21.link/P2441R2>`_,Unassigned,No patch yet,Not started
-C++23,`slide <https://wg21.link/P2442R1>`_,Unassigned,No patch yet,Not started
+C++23,`slide <https://wg21.link/P2442R1>`_,Will Hawkins,`67146 <https://github.com/llvm/llvm-project/pull/67146>`_,In Progress
 C++23,`chunk <https://wg21.link/P2442R1>`_,Unassigned,No patch yet,Not started
 C++23,`chunk_by <https://wg21.link/P2443R1>`_,Jakub Mazurkiewicz,`D144767 <https://llvm.org/D144767>`_,✅
 C++23,`as_const <https://wg21.link/P2278R4>`_,Unassigned,No patch yet,Not started
diff --git a/libcxx/include/__algorithm/pstl_any_all_none_of.h b/libcxx/include/__algorithm/pstl_any_all_none_of.h
index b4b063ca4dda75..47d280c4314938 100644
--- a/libcxx/include/__algorithm/pstl_any_all_none_of.h
+++ b/libcxx/include/__algorithm/pstl_any_all_none_of.h
@@ -39,7 +39,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool
 any_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_any_of),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_any_of, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, _Predicate __g_pred) {
         return std::find_if(__policy, __g_first, __g_last, __g_pred) != __g_last;
       },
@@ -60,7 +60,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool
 all_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_all_of),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_all_of, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, _Pred __g_pred) {
         return !std::any_of(__policy, __g_first, __g_last, [&](__iter_reference<_ForwardIterator> __value) {
           return !__g_pred(__value);
@@ -83,7 +83,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool
 none_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_none_of),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_none_of, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, _Pred __g_pred) {
         return !std::any_of(__policy, __g_first, __g_last, __g_pred);
       },
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 93372f019031b6..3d9459ef5fa23a 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -35,11 +35,29 @@ A PSTL parallel backend is a tag type to which the following functions are assoc
   template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
   void __pstl_stable_sort(_Backend, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp);
 
+  template <class _ExecutionPolicy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _ForwardOutIterator,
+            class _Comp>
+  _ForwardOutIterator __pstl_merge(_Backend,
+                                   _ForwardIterator1 __first1,
+                                   _ForwardIterator1 __last1,
+                                   _ForwardIterator2 __first2,
+                                   _ForwardIterator2 __last2,
+                                   _ForwardOutIterator __result,
+                                   _Comp __comp);
+
   template <class _ExecutionPolicy, class _InIterator, class _OutIterator, class _UnaryOperation>
-  _OutIterator __pstl_transform(_InIterator __first, _InIterator __last, _OutIterator __result, _UnaryOperation __op);
+  _OutIterator __pstl_transform(_Backend,
+                                _InIterator __first,
+                                _InIterator __last,
+                                _OutIterator __result,
+                                _UnaryOperation __op);
 
   template <class _ExecutionPolicy, class _InIterator1, class _InIterator2, class _OutIterator, class _BinaryOperation>
-  _OutIterator __pstl_transform(_InIterator1 __first1,
+  _OutIterator __pstl_transform(_Backend,
+                                _InIterator1 __first1,
                                 _InIterator1 __last1,
                                 _InIterator2 __first2,
                                 _OutIterator __result,
diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h
index 35d995a4719f61..e4a6e5a54e48e7 100644
--- a/libcxx/include/__algorithm/pstl_copy.h
+++ b/libcxx/include/__algorithm/pstl_copy.h
@@ -44,7 +44,7 @@ template <class _ExecutionPolicy,
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator
 copy(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _ForwardOutIterator __result) {
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_copy),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_copy, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _ForwardOutIterator __g_result) {
         return std::transform(__policy, __g_first, __g_last, __g_result, __identity());
       },
@@ -65,7 +65,7 @@ template <class _ExecutionPolicy,
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator
 copy_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __n, _ForwardOutIterator __result) {
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_copy_n),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_copy_n, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _Size __g_n, _ForwardOutIterator __g_result) {
         if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value)
           return std::copy(__policy, __g_first, __g_first + __g_n, __g_result);
diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h
index 15acb85de0bff6..cc1e82457006cf 100644
--- a/libcxx/include/__algorithm/pstl_count.h
+++ b/libcxx/include/__algorithm/pstl_count.h
@@ -45,7 +45,7 @@ _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_ForwardIterator>
 count_if(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   using __diff_t = __iter_diff_t<_ForwardIterator>;
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_count_if),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_count_if, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, _Predicate __g_pred) {
         return std::transform_reduce(
             __policy,
@@ -71,7 +71,7 @@ template <class _ExecutionPolicy,
 _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_ForwardIterator>
 count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_count),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_count, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) {
         return std::count_if(__policy, __g_first, __g_last, [&](__iter_reference<_ForwardIterator> __v) {
           return __v == __g_value;
diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h
index 3d914131e0591f..fc817b5c9e6ea7 100644
--- a/libcxx/include/__algorithm/pstl_fill.h
+++ b/libcxx/include/__algorithm/pstl_fill.h
@@ -42,7 +42,7 @@ _LIBCPP_HIDE_FROM_ABI void
 fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) {
         std::for_each(__policy, __g_first, __g_last, [&](__iter_reference<_ForwardIterator> __element) {
           __element = __g_value;
@@ -66,7 +66,7 @@ _LIBCPP_HIDE_FROM_ABI void
 fill_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _SizeT __n, const _Tp& __value) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill_n),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill_n, _RawPolicy),
       [&](_ForwardIterator __g_first, _SizeT __g_n, const _Tp& __g_value) {
         if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value)
           std::fill(__policy, __g_first, __g_first + __g_n, __g_value);
diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h
index 425d53ff16babc..c2894d08752ef2 100644
--- a/libcxx/include/__algorithm/pstl_find.h
+++ b/libcxx/include/__algorithm/pstl_find.h
@@ -53,7 +53,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardIterator
 find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find_if_not),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find_if_not, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, _Predicate __g_pred) {
         return std::find_if(__policy, __g_first, __g_last, [&](__iter_reference<_ForwardIterator> __value) {
           return !__g_pred(__value);
@@ -76,7 +76,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardIterator
 find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find, _RawPolicy),
       [&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) {
         return std::find_if(__policy, __g_first, __g_last, [&](__iter_reference<_ForwardIterator> __element) {
           return __element == __g_value;
diff --git a/libcxx/include/__algorithm/pstl_for_each.h b/libcxx/include/__algorithm/pstl_for_each.h
index ec22e937aa803e..6e6c73d19f6fec 100644
--- a/libcxx/include/__algorithm/pstl_for_each.h
+++ b/libcxx/include/__algorithm/pstl_for_each.h
@@ -56,7 +56,7 @@ _LIBCPP_HIDE_FROM_ABI void
 for_each_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __size, _Function __func) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_for_each_n),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_for_each_n, _RawPolicy),
       [&](_ForwardIterator __g_first, _Size __g_size, _Function __g_func) {
         if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
           std::for_each(__policy, std::move(__g_first), __g_first + __g_size, std::move(__g_func));
diff --git a/libcxx/include/__algorithm/pstl_frontend_dispatch.h b/libcxx/include/__algorithm/pstl_frontend_dispatch.h
index dc49f3e5163a30..6fa11074911547 100644
--- a/libcxx/include/__algorithm/pstl_frontend_dispatch.h
+++ b/libcxx/include/__algorithm/pstl_frontend_dispatch.h
@@ -21,11 +21,10 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#  define _LIBCPP_PSTL_CUSTOMIZATION_POINT(name)                                                                       \
-    [](auto&&... __args) -> decltype(std::name<_RawPolicy>(typename __select_backend<_RawPolicy>::type{},              \
-                                                           std::forward<decltype(__args)>(__args)...)) {               \
-      return std::name<_RawPolicy>(                                                                                    \
-          typename __select_backend<_RawPolicy>::type{}, std::forward<decltype(__args)>(__args)...);                   \
+#  define _LIBCPP_PSTL_CUSTOMIZATION_POINT(name, policy)                                                               \
+    [](auto&&... __args) -> decltype(std::name<policy>(                                                                \
+                             typename __select_backend<policy>::type{}, std::forward<decltype(__args)>(__args)...)) {  \
+      return std::name<policy>(typename __select_backend<policy>::type{}, std::forward<decltype(__args)>(__args)...);  \
     }
 
 template <class _SpecializedImpl, class _GenericImpl, class... _Args>
diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h
index 4c23c788bf08d2..9a70e2e26bbf97 100644
--- a/libcxx/include/__algorithm/pstl_generate.h
+++ b/libcxx/include/__algorithm/pstl_generate.h
@@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI void
 generate(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Generator __gen) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Generator __g_gen) {
         std::for_each(
             __policy, std::move(__g_first), std::move(__g_last), [&](__iter_reference<_ForwardIterator> __element) {
@@ -65,7 +65,7 @@ _LIBCPP_HIDE_FROM_ABI void
 generate_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __n, _Generator __gen) {
   _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate_n),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate_n, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _Size __g_n, _Generator __g_gen) {
         std::for_each_n(__policy, std::move(__g_first), __g_n, [&](__iter_reference<_ForwardIterator> __element) {
           __element = __g_gen();
diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h
index 6f6e9b2f4478f9..1492ce212742d4 100644
--- a/libcxx/include/__algorithm/pstl_is_partitioned.h
+++ b/libcxx/include/__algorithm/pstl_is_partitioned.h
@@ -38,7 +38,7 @@ template <class _ExecutionPolicy,
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool
 is_partitioned(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_is_partitioned),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_is_partitioned, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Predicate __g_pred) {
         __g_first = std::find_if_not(__policy, __g_first, __g_last, __g_pred);
         if (__g_first == __g_last)
diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h
index 04ffaaba596ae4..08f59ce2deab85 100644
--- a/libcxx/include/__algorithm/pstl_replace.h
+++ b/libcxx/include/__algorithm/pstl_replace.h
@@ -43,7 +43,7 @@ replace_if(_ExecutionPolicy&& __policy,
            _Pred __pred,
            const _Tp& __new_value) {
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_if),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_if, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Pred __g_pred, const _Tp& __g_new_value) {
         std::for_each(__policy, __g_first, __g_last, [&](__iter_reference<_ForwardIterator> __element) {
           if (__g_pred(__element))
@@ -71,7 +71,7 @@ replace(_ExecutionPolicy&& __policy,
         const _Tp& __old_value,
         const _Tp& __new_value) {
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace, _RawPolicy),
       [&__policy](
           _ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_old_value, const _Tp& __g_new_value) {
         std::replace_if(
@@ -105,7 +105,7 @@ _LIBCPP_HIDE_FROM_ABI void replace_copy_if(
     _Pred __pred,
     const _Tp& __new_value) {
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_copy_if),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_copy_if, _RawPolicy),
       [&__policy](_ForwardIterator __g_first,
                   _ForwardIterator __g_last,
                   _ForwardOutIterator __g_result,
@@ -139,7 +139,7 @@ _LIBCPP_HIDE_FROM_ABI void replace_copy(
     const _Tp& __old_value,
     const _Tp& __new_value) {
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_copy),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_copy, _RawPolicy),
       [&__policy](_ForwardIterator __g_first,
                   _ForwardIterator __g_last,
                   _ForwardOutIterator __g_result,
diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h
index 75c77ed405275d..85239df0abebe3 100644
--- a/libcxx/include/__algorithm/pstl_sort.h
+++ b/libcxx/include/__algorithm/pstl_sort.h
@@ -38,7 +38,7 @@ template <class _ExecutionPolicy,
 _LIBCPP_HIDE_FROM_ABI void
 sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_sort),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_sort, _RawPolicy),
       [&__policy](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) {
         std::stable_sort(__policy, std::move(__g_first), std::move(__g_last), std::move(__g_comp));
       },
diff --git a/libcxx/include/__algorithm/unwrap_range.h b/libcxx/include/__algorithm/unwrap_range.h
index 2c75c8f49de938..053fd550b302ee 100644
--- a/libcxx/include/__algorithm/unwrap_range.h
+++ b/libcxx/include/__algorithm/unwrap_range.h
@@ -50,7 +50,7 @@ struct __unwrap_range_impl {
   }
 
   _LIBCPP_HIDE_FROM_ABI static constexpr auto __rewrap(const _Iter&, _Iter __iter)
-    requires (!(random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>))
+    requires(!(random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>))
   {
     return __iter;
   }
@@ -73,10 +73,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr auto __unwrap_range(_Iter __first, _Sent __last)
   return __unwrap_range_impl<_Iter, _Sent>::__unwrap(std::move(__first), std::move(__last));
 }
 
-template <
-    class _Sent,
-    class _Iter,
-    class _Unwrapped = decltype(std::__unwrap_range(std::declval<_Iter>(), std::declval<_Sent>()))>
+template < class _Sent, class _Iter, class _Unwrapped>
 _LIBCPP_HIDE_FROM_ABI constexpr _Iter __rewrap_range(_Iter __orig_iter, _Unwrapped __iter) {
   return __unwrap_range_impl<_Iter, _Sent>::__rewrap(std::move(__orig_iter), std::move(__iter));
 }
@@ -86,7 +83,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR pair<_Unwrapped, _Unwrapped> __unwrap_ra
   return std::make_pair(std::__unwrap_iter(std::move(__first)), std::__unwrap_iter(std::move(__last)));
 }
 
-template <class _Iter, class _Unwrapped = decltype(std::__unwrap_iter(std::declval<_Iter>()))>
+template <class _Iter, class _Unwrapped>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap_range(_Iter __orig_iter, _Unwrapped __iter) {
   return std::__rewrap_iter(std::move(__orig_iter), std::move(__iter));
 }
diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index 579698ec1e932f..a6367945edc7c6 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -179,6 +179,17 @@
 // #   define _LIBCPP_AVAILABILITY_HAS_NO_TZDB
 #   define _LIBCPP_AVAILABILITY_TZDB
 
+// Enable additional explicit instantiations of iostreams components. This
+// reduces the number of weak definitions generated in programs that use
+// iostreams by providing a single strong definition in the shared library.
+//
+// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation,
+//       or once libc++ doesn't use the attribute anymore.
+// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed.
+#if defined(_LIBCPP_COMPILER_GCC) || defined(_WIN32)
+#  define _LIBCPP_AVAILABILITY_HAS_NO_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1
+#endif
+
 #elif defined(__APPLE__)
 
     // shared_mutex and shared_timed_mutex
@@ -356,6 +367,12 @@
 #  define _LIBCPP_AVAILABILITY_HAS_NO_TZDB
 #  define _LIBCPP_AVAILABILITY_TZDB __attribute__((unavailable))
 
+#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 120000)   || \
+      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 150000) || \
+      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 150000)         || \
+      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80000)
+#    define _LIBCPP_AVAILABILITY_HAS_NO_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1
+#  endif
 #else
 
 // ...New vendors can add availability markup here...
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 52bf12f80a28b9..cddc39026f155d 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -193,11 +193,6 @@
 #  endif
 
 #  if defined(_LIBCPP_BUILDING_LIBRARY) || _LIBCPP_ABI_VERSION >= 2
-// Enable additional explicit instantiations of iostreams components. This
-// reduces the number of weak definitions generated in programs that use
-// iostreams by providing a single strong definition in the shared library.
-#    define _LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1
-
 // Define a key function for `bad_function_call` in the library, to centralize
 // its vtable and typeinfo to libc++ rather than having all other libraries
 // using that class define their own copies.
diff --git a/libcxx/include/__functional/bind_back.h b/libcxx/include/__functional/bind_back.h
index 71dc63c86bdbee..0dd2befb524094 100644
--- a/libcxx/include/__functional/bind_back.h
+++ b/libcxx/include/__functional/bind_back.h
@@ -43,14 +43,9 @@ struct __bind_back_t : __perfect_forward<__bind_back_op<tuple_size_v<_BoundArgs>
     using __perfect_forward<__bind_back_op<tuple_size_v<_BoundArgs>>, _Fn, _BoundArgs>::__perfect_forward;
 };
 
-template <class _Fn, class ..._Args, class = enable_if_t<
-    _And<
-        is_constructible<decay_t<_Fn>, _Fn>,
-        is_move_constructible<decay_t<_Fn>>,
-        is_constructible<decay_t<_Args>, _Args>...,
-        is_move_constructible<decay_t<_Args>>...
-    >::value
->>
+template <class _Fn, class... _Args>
+  requires is_constructible_v<decay_t<_Fn>, _Fn> && is_move_constructible_v<decay_t<_Fn>> &&
+           (is_constructible_v<decay_t<_Args>, _Args> && ...) && (is_move_constructible_v<decay_t<_Args>> && ...)
 _LIBCPP_HIDE_FROM_ABI
 constexpr auto __bind_back(_Fn&& __f, _Args&&... __args)
     noexcept(noexcept(__bind_back_t<decay_t<_Fn>, tuple<decay_t<_Args>...>>(_VSTD::forward<_Fn>(__f), _VSTD::forward_as_tuple(_VSTD::forward<_Args>(__args)...))))
diff --git a/libcxx/include/__functional/bind_front.h b/libcxx/include/__functional/bind_front.h
index 72bb6648095962..7ccd6e563b6e19 100644
--- a/libcxx/include/__functional/bind_front.h
+++ b/libcxx/include/__functional/bind_front.h
@@ -42,14 +42,9 @@ struct __bind_front_t : __perfect_forward<__bind_front_op, _Fn, _BoundArgs...> {
     using __perfect_forward<__bind_front_op, _Fn, _BoundArgs...>::__perfect_forward;
 };
 
-template <class _Fn, class... _Args, class = enable_if_t<
-    _And<
-        is_constructible<decay_t<_Fn>, _Fn>,
-        is_move_constructible<decay_t<_Fn>>,
-        is_constructible<decay_t<_Args>, _Args>...,
-        is_move_constructible<decay_t<_Args>>...
-    >::value
->>
+template <class _Fn, class... _Args>
+  requires is_constructible_v<decay_t<_Fn>, _Fn> && is_move_constructible_v<decay_t<_Fn>> &&
+           (is_constructible_v<decay_t<_Args>, _Args> && ...) && (is_move_constructible_v<decay_t<_Args>> && ...)
 _LIBCPP_HIDE_FROM_ABI
 constexpr auto bind_front(_Fn&& __f, _Args&&... __args) {
     return __bind_front_t<decay_t<_Fn>, decay_t<_Args>...>(_VSTD::forward<_Fn>(__f), _VSTD::forward<_Args>(__args)...);
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 98337abe558330..1732c82178568c 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -21,6 +21,7 @@
 #include <__memory/addressof.h>
 #include <__memory/allocator_traits.h>
 #include <__memory/compressed_pair.h>
+#include <__memory/construct_at.h>
 #include <__memory/pointer_traits.h>
 #include <__memory/swap_allocator.h>
 #include <__memory/unique_ptr.h>
@@ -45,6 +46,7 @@
 #include <cmath>
 #include <cstring>
 #include <initializer_list>
+#include <new> // __launder
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -107,19 +109,44 @@ struct __hash_node_base
     }
 
     _LIBCPP_INLINE_VISIBILITY __hash_node_base() _NOEXCEPT : __next_(nullptr) {}
+    _LIBCPP_HIDE_FROM_ABI explicit __hash_node_base(__next_pointer __next) _NOEXCEPT : __next_(__next) {}
 };
 
 template <class _Tp, class _VoidPtr>
-struct _LIBCPP_STANDALONE_DEBUG __hash_node
+struct __hash_node
     : public __hash_node_base
              <
                  __rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> >
              >
 {
     typedef _Tp __node_value_type;
+    using _Base = __hash_node_base<__rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > >;
+    using __next_pointer = typename _Base::__next_pointer;
 
     size_t            __hash_;
-    __node_value_type __value_;
+
+    // We allow starting the lifetime of nodes without initializing the value held by the node,
+    // since that is handled by the hash table itself in order to be allocator-aware.
+#ifndef _LIBCPP_CXX03_LANG
+private:
+    union {
+        _Tp __value_;
+    };
+
+public:
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+#else
+private:
+    _ALIGNAS_TYPE(_Tp) char __buffer_[sizeof(_Tp)];
+
+public:
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() {
+        return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_));
+    }
+#endif
+
+    _LIBCPP_HIDE_FROM_ABI explicit __hash_node(__next_pointer __next, size_t __hash) : _Base(__next), __hash_(__hash) {}
+    _LIBCPP_HIDE_FROM_ABI ~__hash_node() {}
 };
 
 inline _LIBCPP_INLINE_VISIBILITY
@@ -311,12 +338,12 @@ public:
 
     _LIBCPP_INLINE_VISIBILITY
     reference operator*() const {
-        return __node_->__upcast()->__value_;
+        return __node_->__upcast()->__get_value();
     }
 
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const {
-        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__value_);
+        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -387,11 +414,11 @@ public:
 
     _LIBCPP_INLINE_VISIBILITY
     reference operator*() const {
-        return __node_->__upcast()->__value_;
+        return __node_->__upcast()->__get_value();
     }
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const {
-        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__value_);
+        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -453,12 +480,12 @@ public:
 
     _LIBCPP_INLINE_VISIBILITY
     reference operator*() const {
-        return __node_->__upcast()->__value_;
+        return __node_->__upcast()->__get_value();
     }
 
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const {
-        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__value_);
+        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -543,12 +570,12 @@ public:
 
     _LIBCPP_INLINE_VISIBILITY
     reference operator*() const {
-        return __node_->__upcast()->__value_;
+        return __node_->__upcast()->__get_value();
     }
 
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const {
-        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__value_);
+        return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -670,8 +697,10 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     void operator()(pointer __p) _NOEXCEPT
     {
-        if (__value_constructed)
-            __alloc_traits::destroy(__na_, _NodeTypes::__get_ptr(__p->__value_));
+        if (__value_constructed) {
+            __alloc_traits::destroy(__na_, _NodeTypes::__get_ptr(__p->__get_value()));
+            std::__destroy_at(std::addressof(*__p));
+        }
         if (__p)
             __alloc_traits::deallocate(__na_, __p, 1);
     }
@@ -1365,7 +1394,8 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer __np)
     {
         __next_pointer __next = __np->__next_;
         __node_pointer __real_np = __np->__upcast();
-        __node_traits::destroy(__na, _NodeTypes::__get_ptr(__real_np->__value_));
+        __node_traits::destroy(__na, _NodeTypes::__get_ptr(__real_np->__get_value()));
+        std::__destroy_at(std::addressof(*__real_np));
         __node_traits::deallocate(__na, __real_np, 1);
         __np = __next;
     }
@@ -1434,8 +1464,8 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(
                 const_iterator __i = __u.begin();
                 while (__cache != nullptr && __u.size() != 0)
                 {
-                    __cache->__upcast()->__value_ =
-                        _VSTD::move(__u.remove(__i++)->__value_);
+                    __cache->__upcast()->__get_value() =
+                        _VSTD::move(__u.remove(__i++)->__get_value());
                     __next_pointer __next = __cache->__next_;
                     __node_insert_multi(__cache->__upcast());
                     __cache = __next;
@@ -1453,7 +1483,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(
         const_iterator __i = __u.begin();
         while (__u.size() != 0)
         {
-            __node_holder __h = __construct_node(_NodeTypes::__move(__u.remove(__i++)->__value_));
+            __node_holder __h = __construct_node(_NodeTypes::__move(__u.remove(__i++)->__get_value()));
             __node_insert_multi(__h.get());
             __h.release();
         }
@@ -1495,7 +1525,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_unique(_InputIterator __first
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
             for (; __cache != nullptr && __first != __last; ++__first)
             {
-                __cache->__upcast()->__value_ = *__first;
+                __cache->__upcast()->__get_value() = *__first;
                 __next_pointer __next = __cache->__next_;
                 __node_insert_unique(__cache->__upcast());
                 __cache = __next;
@@ -1535,7 +1565,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __first,
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
             for (; __cache != nullptr && __first != __last; ++__first)
             {
-                __cache->__upcast()->__value_ = *__first;
+                __cache->__upcast()->__get_value() = *__first;
                 __next_pointer __next = __cache->__next_;
                 __node_insert_multi(__cache->__upcast());
                 __cache = __next;
@@ -1629,7 +1659,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_unique_prepare(
                                                      __ndptr = __ndptr->__next_)
             {
                 if ((__ndptr->__hash() == __hash) &&
-                    key_eq()(__ndptr->__upcast()->__value_, __value))
+                    key_eq()(__ndptr->__upcast()->__get_value(), __value))
                     return __ndptr;
             }
         }
@@ -1678,9 +1708,9 @@ template <class _Tp, class _Hash, class _Equal, class _Alloc>
 pair<typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator, bool>
 __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_unique(__node_pointer __nd)
 {
-    __nd->__hash_ = hash_function()(__nd->__value_);
+    __nd->__hash_ = hash_function()(__nd->__get_value());
     __next_pointer __existing_node =
-        __node_insert_unique_prepare(__nd->__hash(), __nd->__value_);
+        __node_insert_unique_prepare(__nd->__hash(), __nd->__get_value());
 
     // Insert the node, unless it already exists in the container.
     bool __inserted = false;
@@ -1726,7 +1756,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi_prepare(
             //      false       true        set __found to true
             //      true        false       break
             if (__found != (__pn->__next_->__hash() == __cp_hash &&
-                            key_eq()(__pn->__next_->__upcast()->__value_, __cp_val)))
+                            key_eq()(__pn->__next_->__upcast()->__get_value(), __cp_val)))
             {
                 if (!__found)
                     __found = true;
@@ -1780,8 +1810,8 @@ template <class _Tp, class _Hash, class _Equal, class _Alloc>
 typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator
 __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi(__node_pointer __cp)
 {
-    __cp->__hash_ = hash_function()(__cp->__value_);
-    __next_pointer __pn = __node_insert_multi_prepare(__cp->__hash(), __cp->__value_);
+    __cp->__hash_ = hash_function()(__cp->__get_value());
+    __next_pointer __pn = __node_insert_multi_prepare(__cp->__hash(), __cp->__get_value());
     __node_insert_multi_perform(__cp, __pn);
 
     return iterator(__cp->__ptr());
@@ -1792,7 +1822,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator
 __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi(
         const_iterator __p, __node_pointer __cp)
 {
-    if (__p != end() && key_eq()(*__p, __cp->__value_))
+    if (__p != end() && key_eq()(*__p, __cp->__get_value()))
     {
         __next_pointer __np = __p.__node_;
         __cp->__hash_ = __np->__hash();
@@ -1839,7 +1869,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const&
                                                            __nd = __nd->__next_)
             {
                 if ((__nd->__hash() == __hash) &&
-                    key_eq()(__nd->__upcast()->__value_, __k))
+                    key_eq()(__nd->__upcast()->__get_value(), __k))
                     goto __done;
             }
         }
@@ -1983,9 +2013,9 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_handle_merge_unique(
          __it != __source.end();)
     {
         __node_pointer __src_ptr = __it.__node_->__upcast();
-        size_t __hash = hash_function()(__src_ptr->__value_);
+        size_t __hash = hash_function()(__src_ptr->__get_value());
         __next_pointer __existing_node =
-            __node_insert_unique_prepare(__hash, __src_ptr->__value_);
+            __node_insert_unique_prepare(__hash, __src_ptr->__get_value());
         auto __prev_iter = __it++;
         if (__existing_node == nullptr)
         {
@@ -2037,9 +2067,9 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_handle_merge_multi(
          __it != __source.end();)
     {
         __node_pointer __src_ptr = __it.__node_->__upcast();
-        size_t __src_hash = hash_function()(__src_ptr->__value_);
+        size_t __src_hash = hash_function()(__src_ptr->__get_value());
         __next_pointer __pn =
-            __node_insert_multi_prepare(__src_hash, __src_ptr->__value_);
+            __node_insert_multi_prepare(__src_hash, __src_ptr->__get_value());
         (void)__source.remove(__it++).release();
         __src_ptr->__hash_ = __src_hash;
         __node_insert_multi_perform(__src_ptr, __pn);
@@ -2113,8 +2143,8 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__do_rehash(size_type __nbc)
                         if _LIBCPP_CONSTEXPR_SINCE_CXX17 (!_UniqueKeys)
                         {
                             for (; __np->__next_ != nullptr &&
-                                   key_eq()(__cp->__upcast()->__value_,
-                                            __np->__next_->__upcast()->__value_);
+                                   key_eq()(__cp->__upcast()->__get_value(),
+                                            __np->__next_->__upcast()->__get_value());
                                                                __np = __np->__next_)
                                 ;
                         }
@@ -2148,7 +2178,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k)
                                                            __nd = __nd->__next_)
             {
                 if ((__nd->__hash() == __hash)
-                    && key_eq()(__nd->__upcast()->__value_, __k))
+                    && key_eq()(__nd->__upcast()->__get_value(), __k))
                     return iterator(__nd);
             }
         }
@@ -2175,7 +2205,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) const
                                                            __nd = __nd->__next_)
             {
                 if ((__nd->__hash() == __hash)
-                    && key_eq()(__nd->__upcast()->__value_, __k))
+                    && key_eq()(__nd->__upcast()->__get_value(), __k))
                     return const_iterator(__nd);
             }
         }
@@ -2193,10 +2223,20 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node(_Args&& ...__args)
                   "Construct cannot be called with a hash value type");
     __node_allocator& __na = __node_alloc();
     __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na));
-    __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_), _VSTD::forward<_Args>(__args)...);
+
+    // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value
+    // held inside the node, since we need to use the allocator's construct() method for that.
+    //
+    // We don't use the allocator's construct() method to construct the node itself since the
+    // Cpp17FooInsertable named requirements don't require the allocator's construct() method
+    // to work on anything other than the value_type.
+    std::__construct_at(std::addressof(*__h), /* next = */nullptr, /* hash = */0);
+
+    // Now construct the value_type using the allocator's construct() method.
+    __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__get_value()), _VSTD::forward<_Args>(__args)...);
     __h.get_deleter().__value_constructed = true;
-    __h->__hash_ = hash_function()(__h->__value_);
-    __h->__next_ = nullptr;
+
+    __h->__hash_ = hash_function()(__h->__get_value());
     return __h;
 }
 
@@ -2210,12 +2250,11 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash(
                   "Construct cannot be called with a hash value type");
     __node_allocator& __na = __node_alloc();
     __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na));
-    __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_),
+    std::__construct_at(std::addressof(*__h), /* next = */nullptr, /* hash = */__hash);
+    __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__get_value()),
                              _VSTD::forward<_First>(__f),
                              _VSTD::forward<_Rest>(__rest)...);
     __h.get_deleter().__value_constructed = true;
-    __h->__hash_ = __hash;
-    __h->__next_ = nullptr;
     return __h;
 }
 
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 6be2f22184590a..d9ddb8a17be273 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -1137,7 +1137,8 @@ struct __unbounded_array_control_block<_Tp[], _Alloc> : __shared_weak_count
         __alloc_.~_Alloc();
         size_t __size = __unbounded_array_control_block::__bytes_for(__count_);
         _AlignedStorage* __storage = reinterpret_cast<_AlignedStorage*>(this);
-        allocator_traits<_StorageAlloc>::deallocate(__tmp, _PointerTraits::pointer_to(*__storage), __size);
+        allocator_traits<_StorageAlloc>::deallocate(
+            __tmp, _PointerTraits::pointer_to(*__storage), __size / sizeof(_AlignedStorage));
     }
 
     _LIBCPP_NO_UNIQUE_ADDRESS _Alloc __alloc_;
@@ -1220,7 +1221,7 @@ struct __bounded_array_control_block<_Tp[_Count], _Alloc>
 
         _ControlBlockAlloc __tmp(__alloc_);
         __alloc_.~_Alloc();
-        allocator_traits<_ControlBlockAlloc>::deallocate(__tmp, _PointerTraits::pointer_to(*this), sizeof(*this));
+        allocator_traits<_ControlBlockAlloc>::deallocate(__tmp, _PointerTraits::pointer_to(*this), 1);
     }
 
     _LIBCPP_NO_UNIQUE_ADDRESS _Alloc __alloc_;
@@ -1727,11 +1728,11 @@ template<class _Yp, __enable_if_t<__compatible_with<_Yp, _Tp>::value, int> >
 inline
 weak_ptr<_Tp>::weak_ptr(weak_ptr<_Yp> const& __r)
          _NOEXCEPT
-    : __ptr_(__r.__ptr_),
-      __cntrl_(__r.__cntrl_)
+    : __ptr_(nullptr),
+      __cntrl_(nullptr)
 {
-    if (__cntrl_)
-        __cntrl_->__add_weak();
+    shared_ptr<_Yp> __s = __r.lock();
+    *this = weak_ptr<_Tp>(__s);
 }
 
 template<class _Tp>
@@ -1749,11 +1750,12 @@ template<class _Yp, __enable_if_t<__compatible_with<_Yp, _Tp>::value, int> >
 inline
 weak_ptr<_Tp>::weak_ptr(weak_ptr<_Yp>&& __r)
          _NOEXCEPT
-    : __ptr_(__r.__ptr_),
-      __cntrl_(__r.__cntrl_)
+    : __ptr_(nullptr),
+      __cntrl_(nullptr)
 {
-    __r.__ptr_ = nullptr;
-    __r.__cntrl_ = nullptr;
+    shared_ptr<_Yp> __s = __r.lock();
+    *this = weak_ptr<_Tp>(__s);
+    __r.reset();
 }
 
 template<class _Tp>
diff --git a/libcxx/include/__node_handle b/libcxx/include/__node_handle
index cc4eaf73c0bbea..b3cc3619dd5ad5 100644
--- a/libcxx/include/__node_handle
+++ b/libcxx/include/__node_handle
@@ -209,7 +209,7 @@ struct __set_node_handle_specifics
     _LIBCPP_INLINE_VISIBILITY
     value_type& value() const
     {
-        return static_cast<_Derived const*>(this)->__ptr_->__value_;
+        return static_cast<_Derived const*>(this)->__ptr_->__get_value();
     }
 };
 
@@ -223,14 +223,14 @@ struct __map_node_handle_specifics
     key_type& key() const
     {
         return static_cast<_Derived const*>(this)->
-            __ptr_->__value_.__ref().first;
+            __ptr_->__get_value().__ref().first;
     }
 
     _LIBCPP_INLINE_VISIBILITY
     mapped_type& mapped() const
     {
         return static_cast<_Derived const*>(this)->
-            __ptr_->__value_.__ref().second;
+            __ptr_->__get_value().__ref().second;
     }
 };
 
diff --git a/libcxx/include/__numeric/pstl_reduce.h b/libcxx/include/__numeric/pstl_reduce.h
index 163e0078e10e5c..22ef2707d7d3ee 100644
--- a/libcxx/include/__numeric/pstl_reduce.h
+++ b/libcxx/include/__numeric/pstl_reduce.h
@@ -40,7 +40,7 @@ reduce(_ExecutionPolicy&& __policy,
        _Tp __init,
        _BinaryOperation __op = {}) {
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_reduce),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_reduce, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Tp __g_init, _BinaryOperation __g_op) {
         return std::transform_reduce(
             __policy, std::move(__g_first), std::move(__g_last), std::move(__g_init), std::move(__g_op), __identity{});
@@ -58,7 +58,7 @@ template <class _ExecutionPolicy,
 _LIBCPP_HIDE_FROM_ABI __iter_value_type<_ForwardIterator>
 reduce(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last) {
   return std::__pstl_frontend_dispatch(
-      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_reduce),
+      _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_reduce, _RawPolicy),
       [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last) {
         return std::reduce(__policy, __g_first, __g_last, __iter_value_type<_ForwardIterator>());
       },
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index f372688abfdaf3..ccf0c7a8e8d50e 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -363,15 +363,14 @@ namespace ranges {
               (integral<_Start> && integral<_BoundSentinel>) || sized_sentinel_for<_BoundSentinel, _Start>
     {
       if constexpr (__integer_like<_Start> && __integer_like<_BoundSentinel>) {
-        if (__value_ < 0) {
-          if (__bound_sentinel_ < 0) {
-            return std::__to_unsigned_like(-__value_) - std::__to_unsigned_like(-__bound_sentinel_);
-          }
-          return std::__to_unsigned_like(__bound_sentinel_) + std::__to_unsigned_like(-__value_);
-        }
-        return std::__to_unsigned_like(__bound_sentinel_) - std::__to_unsigned_like(__value_);
+        return (__value_ < 0)
+                 ? ((__bound_sentinel_ < 0)
+                        ? std::__to_unsigned_like(-__value_) - std::__to_unsigned_like(-__bound_sentinel_)
+                        : std::__to_unsigned_like(__bound_sentinel_) + std::__to_unsigned_like(-__value_))
+                 : std::__to_unsigned_like(__bound_sentinel_) - std::__to_unsigned_like(__value_);
+      } else {
+        return std::__to_unsigned_like(__bound_sentinel_ - __value_);
       }
-      return std::__to_unsigned_like(__bound_sentinel_ - __value_);
     }
   };
 
diff --git a/libcxx/include/__tree b/libcxx/include/__tree
index 54ce71e442d037..eccadea8a01393 100644
--- a/libcxx/include/__tree
+++ b/libcxx/include/__tree
@@ -774,6 +774,8 @@ public:
 
     __node_value_type __value_;
 
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+
 private:
   ~__tree_node() = delete;
   __tree_node(__tree_node const&) = delete;
diff --git a/libcxx/include/ext/hash_map b/libcxx/include/ext/hash_map
index 116b6a72f2c120..de963675eb793c 100644
--- a/libcxx/include/ext/hash_map
+++ b/libcxx/include/ext/hash_map
@@ -357,9 +357,9 @@ public:
     void operator()(pointer __p)
     {
         if (__second_constructed)
-            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__value_.second));
+            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__get_value().second));
         if (__first_constructed)
-            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__value_.first));
+            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__get_value().first));
         if (__p)
             __alloc_traits::deallocate(__na_, __p, 1);
     }
@@ -667,9 +667,9 @@ hash_map<_Key, _Tp, _Hash, _Pred, _Alloc>::__construct_node(const key_type& __k)
 {
     __node_allocator& __na = __table_.__node_alloc();
     __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na));
-    __node_traits::construct(__na, _VSTD::addressof(__h->__value_.first), __k);
+    __node_traits::construct(__na, _VSTD::addressof(__h->__get_value().first), __k);
     __h.get_deleter().__first_constructed = true;
-    __node_traits::construct(__na, _VSTD::addressof(__h->__value_.second));
+    __node_traits::construct(__na, _VSTD::addressof(__h->__get_value().second));
     __h.get_deleter().__second_constructed = true;
     return __h;
 }
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 75ac685cc02839..09338ab6957134 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -211,6 +211,7 @@ template <class T, class Allocator, class Predicate>
 #include <__memory/allocator.h>
 #include <__memory/allocator_traits.h>
 #include <__memory/compressed_pair.h>
+#include <__memory/construct_at.h>
 #include <__memory/pointer_traits.h>
 #include <__memory/swap_allocator.h>
 #include <__memory_resource/polymorphic_allocator.h>
@@ -230,6 +231,7 @@ template <class T, class Allocator, class Predicate>
 #include <__utility/forward.h>
 #include <__utility/move.h>
 #include <limits>
+#include <new> // __launder
 #include <version>
 
 // standard-mandated includes
@@ -318,17 +320,35 @@ template <class _Tp, class _VoidPtr>
 using __begin_node_of = __forward_begin_node<__rebind_pointer_t<_VoidPtr, __forward_list_node<_Tp, _VoidPtr> > >;
 
 template <class _Tp, class _VoidPtr>
-struct _LIBCPP_STANDALONE_DEBUG __forward_list_node
+struct __forward_list_node
     : public __begin_node_of<_Tp, _VoidPtr>
 {
     typedef _Tp value_type;
     typedef __begin_node_of<_Tp, _VoidPtr> _Base;
     typedef typename _Base::pointer _NodePtr;
 
-    value_type __value_;
+    // We allow starting the lifetime of nodes without initializing the value held by the node,
+    // since that is handled by the list itself in order to be allocator-aware.
+#ifndef _LIBCPP_CXX03_LANG
+private:
+    union {
+        _Tp __value_;
+    };
+
+public:
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+#else
+private:
+    _ALIGNAS_TYPE(_Tp) char __buffer_[sizeof(_Tp)];
+
+public:
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() {
+        return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_));
+    }
+#endif
 
-    _LIBCPP_HIDE_FROM_ABI __forward_list_node() = default;
-    _LIBCPP_HIDE_FROM_ABI __forward_list_node(const value_type& __v, _NodePtr __next) : _Base(__next), __value_(__v) {}
+    _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {}
+    _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {}
 };
 
 
@@ -383,10 +403,10 @@ public:
     __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
 
     _LIBCPP_INLINE_VISIBILITY
-    reference operator*() const {return __get_unsafe_node_pointer()->__value_;}
+    reference operator*() const {return __get_unsafe_node_pointer()->__get_value();}
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const {
-        return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__value_);
+        return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -468,10 +488,10 @@ public:
         : __ptr_(__p.__ptr_) {}
 
     _LIBCPP_INLINE_VISIBILITY
-    reference operator*() const {return __get_unsafe_node_pointer()->__value_;}
+    reference operator*() const {return __get_unsafe_node_pointer()->__get_value();}
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const {return pointer_traits<pointer>::pointer_to(
-                __get_unsafe_node_pointer()->__value_);}
+                __get_unsafe_node_pointer()->__get_value());}
 
     _LIBCPP_INLINE_VISIBILITY
     __forward_list_const_iterator& operator++()
@@ -577,15 +597,26 @@ protected:
     _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__node_pointer __next, _Args&& ...__args) {
         __node_allocator& __a = __alloc();
         __allocation_guard<__node_allocator> __guard(__a, 1);
-        __guard.__get()->__next_ = __next;
-        __node_traits::construct(__a, std::addressof(__guard.__get()->__value_), std::forward<_Args>(__args)...);
+        // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value
+        // held inside the node, since we need to use the allocator's construct() method for that.
+        //
+        // We don't use the allocator's construct() method to construct the node itself since the
+        // Cpp17FooInsertable named requirements don't require the allocator's construct() method
+        // to work on anything other than the value_type.
+        std::__construct_at(std::addressof(*__guard.__get()), __next);
+
+        // Now construct the value_type using the allocator's construct() method.
+        __node_traits::construct(__a, std::addressof(__guard.__get()->__get_value()), std::forward<_Args>(__args)...);
         return __guard.__release_ptr();
     }
 
     template <class ..._Args>
     _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
+        // For the same reason as above, we use the allocator's destroy() method for the value_type,
+        // but not for the node itself.
         __node_allocator& __a = __alloc();
-        __node_traits::destroy(__a, std::addressof(__node->__value_));
+        __node_traits::destroy(__a, std::addressof(__node->__get_value()));
+        std::__destroy_at(std::addressof(*__node));
         __node_traits::deallocate(__a, __node, 1);
     }
 
@@ -847,9 +878,9 @@ public:
     }
 
     _LIBCPP_INLINE_VISIBILITY
-    reference       front()       {return base::__before_begin()->__next_->__value_;}
+    reference       front()       {return base::__before_begin()->__next_->__get_value();}
     _LIBCPP_INLINE_VISIBILITY
-    const_reference front() const {return base::__before_begin()->__next_->__value_;}
+    const_reference front() const {return base::__before_begin()->__next_->__get_value();}
 
 #ifndef _LIBCPP_CXX03_LANG
 #if _LIBCPP_STD_VER >= 17
@@ -1227,7 +1258,7 @@ forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args)
 {
     base::__before_begin()->__next_ = this->__create_node(/* next = */base::__before_begin()->__next_, std::forward<_Args>(__args)...);
 #if _LIBCPP_STD_VER >= 17
-    return base::__before_begin()->__next_->__value_;
+    return base::__before_begin()->__next_->__get_value();
 #endif
 }
 
@@ -1556,7 +1587,7 @@ forward_list<_Tp, _Alloc>::remove(const value_type& __v)
     const iterator __e = end();
     for (iterator __i = before_begin(); __i.__get_begin()->__next_ != nullptr;)
     {
-        if (__i.__get_begin()->__next_->__value_ == __v)
+        if (__i.__get_begin()->__next_->__get_value() == __v)
         {
             ++__count_removed;
             iterator __j = _VSTD::next(__i, 2);
@@ -1584,7 +1615,7 @@ forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred)
     const iterator __e = end();
     for (iterator __i = before_begin(); __i.__get_begin()->__next_ != nullptr;)
     {
-        if (__pred(__i.__get_begin()->__next_->__value_))
+        if (__pred(__i.__get_begin()->__next_->__get_value()))
         {
             ++__count_removed;
             iterator __j = _VSTD::next(__i, 2);
@@ -1647,11 +1678,11 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2,
     if (__f2 == nullptr)
         return __f1;
     __node_pointer __r;
-    if (__comp(__f2->__value_, __f1->__value_))
+    if (__comp(__f2->__get_value(), __f1->__get_value()))
     {
         __node_pointer __t = __f2;
         while (__t->__next_ != nullptr &&
-                             __comp(__t->__next_->__value_, __f1->__value_))
+                             __comp(__t->__next_->__get_value(), __f1->__get_value()))
             __t = __t->__next_;
         __r = __f2;
         __f2 = __t->__next_;
@@ -1663,11 +1694,11 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2,
     __f1 = __f1->__next_;
     while (__f1 != nullptr && __f2 != nullptr)
     {
-        if (__comp(__f2->__value_, __f1->__value_))
+        if (__comp(__f2->__get_value(), __f1->__get_value()))
         {
             __node_pointer __t = __f2;
             while (__t->__next_ != nullptr &&
-                                 __comp(__t->__next_->__value_, __f1->__value_))
+                                 __comp(__t->__next_->__get_value(), __f1->__get_value()))
                 __t = __t->__next_;
             __p->__next_ = __f2;
             __f2 = __t->__next_;
@@ -1703,7 +1734,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz,
     case 1:
         return __f1;
     case 2:
-        if (__comp(__f1->__next_->__value_, __f1->__value_))
+        if (__comp(__f1->__next_->__get_value(), __f1->__get_value()))
         {
             __node_pointer __t = __f1->__next_;
             __t->__next_ = __f1;
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index cf5ca142e7d958..024eef8a9d6698 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -1734,7 +1734,7 @@ basic_fstream<_CharT, _Traits>::close()
         this->setstate(ios_base::failbit);
 }
 
-#if defined(_LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1)
+#ifndef _LIBCPP_AVAILABILITY_HAS_NO_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1
 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ifstream<char>;
 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ofstream<char>;
 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_filebuf<char>;
diff --git a/libcxx/include/future b/libcxx/include/future
index 273e4175e604bd..f372b8e6ad5d8d 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -44,14 +44,15 @@ error_condition make_error_condition(future_errc e) noexcept;
 
 const error_category& future_category() noexcept;
 
-class future_error
-    : public logic_error
-{
+class future_error : public logic_error {
 public:
-    future_error(error_code ec);  // exposition only
-    explicit future_error(future_errc); // C++17
+    explicit future_error(future_errc e); // since C++17
+
     const error_code& code() const noexcept;
     const char*       what() const noexcept;
+
+private:
+    error_code ec_;             // exposition only
 };
 
 template <class R>
@@ -516,12 +517,25 @@ make_error_condition(future_errc __e) _NOEXCEPT
     return error_condition(static_cast<int>(__e), future_category());
 }
 
+_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+_LIBCPP_AVAILABILITY_FUTURE_ERROR
+#endif
+void __throw_future_error(future_errc __ev);
+
 class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_AVAILABILITY_FUTURE_ERROR future_error
     : public logic_error
 {
     error_code __ec_;
+
+    future_error(error_code);
+    friend void __throw_future_error(future_errc);
+    template <class> friend class promise;
+
 public:
-    future_error(error_code __ec);
+#if _LIBCPP_STD_VER >= 17
+    _LIBCPP_HIDE_FROM_ABI explicit future_error(future_errc __ec) : future_error(std::make_error_code(__ec)) {}
+#endif
 
     _LIBCPP_INLINE_VISIBILITY
     const error_code& code() const _NOEXCEPT {return __ec_;}
@@ -530,10 +544,7 @@ public:
     ~future_error() _NOEXCEPT override;
 };
 
-_LIBCPP_NORETURN inline _LIBCPP_INLINE_VISIBILITY
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-_LIBCPP_AVAILABILITY_FUTURE_ERROR
-#endif
+// Declared above std::future_error
 void __throw_future_error(future_errc __ev)
 {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
@@ -1359,9 +1370,7 @@ promise<_Rp>::~promise()
     if (__state_)
     {
         if (!__state_->__has_value() && __state_->use_count() > 1)
-            __state_->set_exception(make_exception_ptr(
-                      future_error(make_error_code(future_errc::broken_promise))
-                                                      ));
+            __state_->set_exception(make_exception_ptr(future_error(make_error_code(future_errc::broken_promise))));
         __state_->__release_shared();
     }
 }
@@ -1502,9 +1511,7 @@ promise<_Rp&>::~promise()
     if (__state_)
     {
         if (!__state_->__has_value() && __state_->use_count() > 1)
-            __state_->set_exception(make_exception_ptr(
-                      future_error(make_error_code(future_errc::broken_promise))
-                                                      ));
+            __state_->set_exception(make_exception_ptr(future_error(make_error_code(future_errc::broken_promise))));
         __state_->__release_shared();
     }
 }
diff --git a/libcxx/include/limits b/libcxx/include/limits
index 9f5949e63cff01..51daee6c4968cb 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -43,8 +43,8 @@ public:
     static constexpr bool has_infinity = false;
     static constexpr bool has_quiet_NaN = false;
     static constexpr bool has_signaling_NaN = false;
-    static constexpr float_denorm_style has_denorm = denorm_absent;
-    static constexpr bool has_denorm_loss = false;
+    static constexpr float_denorm_style has_denorm = denorm_absent; // deprecated in C++23
+    static constexpr bool has_denorm_loss = false;                  // deprecated in C++23
     static constexpr T infinity() noexcept;
     static constexpr T quiet_NaN() noexcept;
     static constexpr T signaling_NaN() noexcept;
@@ -68,7 +68,7 @@ enum float_round_style
     round_toward_neg_infinity =  3
 };
 
-enum float_denorm_style
+enum float_denorm_style // deprecated in C++23
 {
     denorm_indeterminate = -1,
     denorm_absent = 0,
@@ -128,7 +128,7 @@ enum float_round_style
     round_toward_neg_infinity =  3
 };
 
-enum float_denorm_style
+enum _LIBCPP_DEPRECATED_IN_CXX23 float_denorm_style
 {
     denorm_indeterminate = -1,
     denorm_absent = 0,
@@ -164,8 +164,8 @@ protected:
     static _LIBCPP_CONSTEXPR const bool has_infinity = false;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = false;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = false;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return type();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return type();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return type();}
@@ -224,8 +224,8 @@ protected:
     static _LIBCPP_CONSTEXPR const bool has_infinity = false;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = false;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = false;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return type(0);}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return type(0);}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return type(0);}
@@ -277,8 +277,8 @@ protected:
     static _LIBCPP_CONSTEXPR const bool has_infinity = false;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = false;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = false;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return type(0);}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return type(0);}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return type(0);}
@@ -323,8 +323,8 @@ protected:
     static _LIBCPP_CONSTEXPR const bool has_infinity = true;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = true;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = true;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __builtin_huge_valf();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __builtin_nanf("");}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __builtin_nansf("");}
@@ -373,8 +373,8 @@ protected:
     static _LIBCPP_CONSTEXPR const bool has_infinity = true;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = true;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = true;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __builtin_huge_val();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __builtin_nan("");}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __builtin_nans("");}
@@ -423,8 +423,8 @@ protected:
     static _LIBCPP_CONSTEXPR const bool has_infinity = true;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = true;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = true;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false;
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __builtin_huge_vall();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __builtin_nanl("");}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __builtin_nansl("");}
@@ -477,8 +477,10 @@ public:
     static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_PUSH
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_POP
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __base::infinity();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __base::quiet_NaN();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __base::signaling_NaN();}
@@ -570,8 +572,10 @@ public:
     static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_PUSH
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_POP
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __base::infinity();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __base::quiet_NaN();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __base::signaling_NaN();}
@@ -663,8 +667,10 @@ public:
     static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_PUSH
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_POP
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __base::infinity();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __base::quiet_NaN();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __base::signaling_NaN();}
@@ -756,8 +762,10 @@ public:
     static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity;
     static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN;
     static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN;
-    static _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
-    static _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_PUSH
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
+    static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss;
+_LIBCPP_SUPPRESS_DEPRECATED_POP
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT {return __base::infinity();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT {return __base::quiet_NaN();}
     _LIBCPP_INLINE_VISIBILITY static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT {return __base::signaling_NaN();}
diff --git a/libcxx/include/list b/libcxx/include/list
index b02599bc3fe7c3..e5b524b8835a11 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -217,6 +217,7 @@ template <class T, class Allocator, class Predicate>
 #include <__memory/allocator.h>
 #include <__memory/allocator_traits.h>
 #include <__memory/compressed_pair.h>
+#include <__memory/construct_at.h>
 #include <__memory/pointer_traits.h>
 #include <__memory/swap_allocator.h>
 #include <__memory_resource/polymorphic_allocator.h>
@@ -237,6 +238,7 @@ template <class T, class Allocator, class Predicate>
 #include <__utility/swap.h>
 #include <cstring>
 #include <limits>
+#include <new> // __launder
 #include <version>
 
 // standard-mandated includes
@@ -308,6 +310,9 @@ struct __list_node_base
     __list_node_base() : __prev_(_NodeTraits::__unsafe_link_pointer_cast(__self())),
                          __next_(_NodeTraits::__unsafe_link_pointer_cast(__self())) {}
 
+    _LIBCPP_HIDE_FROM_ABI explicit __list_node_base(__link_pointer __prev, __link_pointer __next)
+        : __prev_(__prev), __next_(__next) {}
+
     _LIBCPP_INLINE_VISIBILITY
     __base_pointer __self() {
         return pointer_traits<__base_pointer>::pointer_to(*this);
@@ -320,14 +325,35 @@ struct __list_node_base
 };
 
 template <class _Tp, class _VoidPtr>
-struct _LIBCPP_STANDALONE_DEBUG __list_node
+struct __list_node
     : public __list_node_base<_Tp, _VoidPtr>
 {
-    _Tp __value_;
+    // We allow starting the lifetime of nodes without initializing the value held by the node,
+    // since that is handled by the list itself in order to be allocator-aware.
+#ifndef _LIBCPP_CXX03_LANG
+private:
+    union {
+        _Tp __value_;
+    };
+
+public:
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+#else
+private:
+    _ALIGNAS_TYPE(_Tp) char __buffer_[sizeof(_Tp)];
+
+public:
+    _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() {
+        return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_));
+    }
+#endif
 
     typedef __list_node_base<_Tp, _VoidPtr> __base;
     typedef typename __base::__link_pointer __link_pointer;
 
+    _LIBCPP_HIDE_FROM_ABI explicit __list_node(__link_pointer __prev, __link_pointer __next) : __base(__prev, __next) {}
+    _LIBCPP_HIDE_FROM_ABI ~__list_node() {}
+
     _LIBCPP_INLINE_VISIBILITY
     __link_pointer __as_link() {
         return static_cast<__link_pointer>(__base::__self());
@@ -370,12 +396,12 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     reference operator*() const
     {
-        return __ptr_->__as_node()->__value_;
+        return __ptr_->__as_node()->__get_value();
     }
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const
     {
-        return pointer_traits<pointer>::pointer_to(__ptr_->__as_node()->__value_);
+        return pointer_traits<pointer>::pointer_to(__ptr_->__as_node()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -442,12 +468,12 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     reference operator*() const
     {
-        return __ptr_->__as_node()->__value_;
+        return __ptr_->__as_node()->__get_value();
     }
     _LIBCPP_INLINE_VISIBILITY
     pointer operator->() const
     {
-        return pointer_traits<pointer>::pointer_to(__ptr_->__as_node()->__value_);
+        return pointer_traits<pointer>::pointer_to(__ptr_->__as_node()->__get_value());
     }
 
     _LIBCPP_INLINE_VISIBILITY
@@ -600,16 +626,26 @@ protected:
     _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__link_pointer __prev, __link_pointer __next, _Args&& ...__args) {
         __node_allocator& __alloc = __node_alloc();
         __allocation_guard<__node_allocator> __guard(__alloc, 1);
-        __guard.__get()->__prev_ = __prev;
-        __guard.__get()->__next_ = __next;
-        __node_alloc_traits::construct(__alloc, std::addressof(__guard.__get()->__value_), std::forward<_Args>(__args)...);
+        // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value
+        // held inside the node, since we need to use the allocator's construct() method for that.
+        //
+        // We don't use the allocator's construct() method to construct the node itself since the
+        // Cpp17FooInsertable named requirements don't require the allocator's construct() method
+        // to work on anything other than the value_type.
+        std::__construct_at(std::addressof(*__guard.__get()), __prev, __next);
+
+        // Now construct the value_type using the allocator's construct() method.
+        __node_alloc_traits::construct(__alloc, std::addressof(__guard.__get()->__get_value()), std::forward<_Args>(__args)...);
         return __guard.__release_ptr();
     }
 
     template <class ..._Args>
     _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
+        // For the same reason as above, we use the allocator's destroy() method for the value_type,
+        // but not for the node itself.
         __node_allocator& __alloc = __node_alloc();
-        __node_alloc_traits::destroy(__alloc, std::addressof(__node->__value_));
+        __node_alloc_traits::destroy(__alloc, std::addressof(__node->__get_value()));
+        std::__destroy_at(std::addressof(*__node));
         __node_alloc_traits::deallocate(__alloc, __node, 1);
     }
 
@@ -894,25 +930,25 @@ public:
     reference front()
     {
         _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list");
-        return base::__end_.__next_->__as_node()->__value_;
+        return base::__end_.__next_->__as_node()->__get_value();
     }
     _LIBCPP_INLINE_VISIBILITY
     const_reference front() const
     {
         _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list");
-        return base::__end_.__next_->__as_node()->__value_;
+        return base::__end_.__next_->__as_node()->__get_value();
     }
     _LIBCPP_INLINE_VISIBILITY
     reference back()
     {
         _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list");
-        return base::__end_.__prev_->__as_node()->__value_;
+        return base::__end_.__prev_->__as_node()->__get_value();
     }
     _LIBCPP_INLINE_VISIBILITY
     const_reference back() const
     {
         _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list");
-        return base::__end_.__prev_->__as_node()->__value_;
+        return base::__end_.__prev_->__as_node()->__get_value();
     }
 
 #ifndef _LIBCPP_CXX03_LANG
@@ -1502,7 +1538,7 @@ list<_Tp, _Alloc>::emplace_front(_Args&&... __args)
     __link_nodes_at_front(__nl, __nl);
     ++base::__sz();
 #if _LIBCPP_STD_VER >= 17
-    return __node->__value_;
+    return __node->__get_value();
 #endif
 }
 
@@ -1520,7 +1556,7 @@ list<_Tp, _Alloc>::emplace_back(_Args&&... __args)
     __link_nodes_at_back(__nl, __nl);
     ++base::__sz();
 #if _LIBCPP_STD_VER >= 17
-    return __node->__value_;
+    return __node->__get_value();
 #endif
 }
 
diff --git a/libcxx/include/math.h b/libcxx/include/math.h
index fdf3231c2978ce..b068ea388f095b 100644
--- a/libcxx/include/math.h
+++ b/libcxx/include/math.h
@@ -387,8 +387,19 @@ namespace __math {
 
 // fpclassify
 
-template <class _A1, std::__enable_if_t<std::is_floating_point<_A1>::value, int> = 0>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI int fpclassify(_A1 __x) _NOEXCEPT {
+// template on non-double overloads to make them weaker than same overloads from MSVC runtime
+template <class = int>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI int fpclassify(float __x) _NOEXCEPT {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x);
+}
+
+template <class = int>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI int fpclassify(double __x) _NOEXCEPT {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x);
+}
+
+template <class = int>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI int fpclassify(long double __x) _NOEXCEPT {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x);
 }
 
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 40930df24c6d08..47c2d0553a57c7 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -1192,7 +1192,7 @@ swap(basic_stringstream<_CharT, _Traits, _Allocator>& __x,
     __x.swap(__y);
 }
 
-#if defined(_LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1)
+#ifndef _LIBCPP_AVAILABILITY_HAS_NO_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1
 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_stringbuf<char>;
 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_stringstream<char>;
 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostringstream<char>;
diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map
index 8d83063bbeaeb3..e5c58feee55d45 100644
--- a/libcxx/include/unordered_map
+++ b/libcxx/include/unordered_map
@@ -874,9 +874,9 @@ public:
     void operator()(pointer __p) _NOEXCEPT
     {
         if (__second_constructed)
-            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__value_.__get_value().second));
+            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__get_value().__get_value().second));
         if (__first_constructed)
-            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__value_.__get_value().first));
+            __alloc_traits::destroy(__na_, _VSTD::addressof(__p->__get_value().__get_value().first));
         if (__p)
             __alloc_traits::deallocate(__na_, __p, 1);
     }
@@ -1828,7 +1828,7 @@ unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>::unordered_map(
         iterator __i = __u.begin();
         while (__u.size() != 0) {
             __table_.__emplace_unique(
-                __u.__table_.remove((__i++).__i_)->__value_.__move());
+                __u.__table_.remove((__i++).__i_)->__get_value().__move());
         }
     }
 }
@@ -1920,9 +1920,9 @@ unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>::__construct_node_with_key(const
 {
     __node_allocator& __na = __table_.__node_alloc();
     __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na));
-    __node_traits::construct(__na, _VSTD::addressof(__h->__value_.__get_value().first), __k);
+    __node_traits::construct(__na, _VSTD::addressof(__h->__get_value().__get_value().first), __k);
     __h.get_deleter().__first_constructed = true;
-    __node_traits::construct(__na, _VSTD::addressof(__h->__value_.__get_value().second));
+    __node_traits::construct(__na, _VSTD::addressof(__h->__get_value().__get_value().second));
     __h.get_deleter().__second_constructed = true;
     return __h;
 }
@@ -2653,7 +2653,7 @@ unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>::unordered_multimap(
         while (__u.size() != 0)
         {
             __table_.__insert_multi(
-                __u.__table_.remove((__i++).__i_)->__value_.__move());
+                __u.__table_.remove((__i++).__i_)->__get_value().__move());
         }
     }
 }
diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set
index 5e47f12446ff93..f1b4104df4f68f 100644
--- a/libcxx/include/unordered_set
+++ b/libcxx/include/unordered_set
@@ -1150,7 +1150,7 @@ unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(
     {
         iterator __i = __u.begin();
         while (__u.size() != 0)
-            __table_.__insert_unique(_VSTD::move(__u.__table_.remove(__i++)->__value_));
+            __table_.__insert_unique(_VSTD::move(__u.__table_.remove(__i++)->__get_value()));
     }
 }
 
@@ -1835,7 +1835,7 @@ unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(
     {
         iterator __i = __u.begin();
         while (__u.size() != 0)
-            __table_.__insert_multi(_VSTD::move(__u.__table_.remove(__i++)->__value_));
+            __table_.__insert_multi(_VSTD::move(__u.__table_.remove(__i++)->__get_value()));
     }
 }
 
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 6d26ce169b600a..4ec6b602371eae 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -833,11 +833,11 @@ private:
 
     template <class _Up>
     _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
-    inline void __push_back_slow_path(_Up&& __x);
+    inline pointer __push_back_slow_path(_Up&& __x);
 
     template <class... _Args>
     _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
-    inline void __emplace_back_slow_path(_Args&&... __args);
+    inline pointer __emplace_back_slow_path(_Args&&... __args);
 
     // The following functions are no-ops outside of AddressSanitizer mode.
     // We call annotations for every allocator, unless explicitly disabled.
@@ -1609,7 +1609,7 @@ vector<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT
 template <class _Tp, class _Allocator>
 template <class _Up>
 _LIBCPP_CONSTEXPR_SINCE_CXX20
-void
+typename vector<_Tp, _Allocator>::pointer
 vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x)
 {
     allocator_type& __a = this->__alloc();
@@ -1618,6 +1618,7 @@ vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x)
     __alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Up>(__x));
     __v.__end_++;
     __swap_out_circular_buffer(__v);
+    return this->__end_;
 }
 
 template <class _Tp, class _Allocator>
@@ -1626,12 +1627,14 @@ inline _LIBCPP_HIDE_FROM_ABI
 void
 vector<_Tp, _Allocator>::push_back(const_reference __x)
 {
-    if (this->__end_ != this->__end_cap())
-    {
+    pointer __end = this->__end_;
+    if (__end < this->__end_cap()) {
         __construct_one_at_end(__x);
+        ++__end;
+    } else {
+        __end = __push_back_slow_path(__x);
     }
-    else
-        __push_back_slow_path(__x);
+    this->__end_ = __end;
 }
 
 template <class _Tp, class _Allocator>
@@ -1640,18 +1643,20 @@ inline _LIBCPP_HIDE_FROM_ABI
 void
 vector<_Tp, _Allocator>::push_back(value_type&& __x)
 {
-    if (this->__end_ < this->__end_cap())
-    {
+    pointer __end = this->__end_;
+    if (__end < this->__end_cap()) {
         __construct_one_at_end(std::move(__x));
+        ++__end;
+    } else {
+        __end = __push_back_slow_path(std::move(__x));
     }
-    else
-        __push_back_slow_path(std::move(__x));
+    this->__end_ = __end;
 }
 
 template <class _Tp, class _Allocator>
 template <class... _Args>
 _LIBCPP_CONSTEXPR_SINCE_CXX20
-void
+typename vector<_Tp, _Allocator>::pointer
 vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
 {
     allocator_type& __a = this->__alloc();
@@ -1660,6 +1665,7 @@ vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
     __alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Args>(__args)...);
     __v.__end_++;
     __swap_out_circular_buffer(__v);
+    return this->__end_;
 }
 
 template <class _Tp, class _Allocator>
@@ -1673,14 +1679,16 @@ void
 #endif
 vector<_Tp, _Allocator>::emplace_back(_Args&&... __args)
 {
-    if (this->__end_ < this->__end_cap())
-    {
+    pointer __end = this->__end_;
+    if (__end < this->__end_cap()) {
         __construct_one_at_end(std::forward<_Args>(__args)...);
+        ++__end;
+    } else {
+        __end = __emplace_back_slow_path(std::forward<_Args>(__args)...);
     }
-    else
-        __emplace_back_slow_path(std::forward<_Args>(__args)...);
+    this->__end_ = __end;
 #if _LIBCPP_STD_VER >= 17
-    return this->back();
+    return *(__end - 1);
 #endif
 }
 
diff --git a/libcxx/src/future.cpp b/libcxx/src/future.cpp
index fa1004e1357128..3383b506a2fab3 100644
--- a/libcxx/src/future.cpp
+++ b/libcxx/src/future.cpp
@@ -204,9 +204,7 @@ promise<void>::~promise()
     {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
         if (!__state_->__has_value() && __state_->use_count() > 1)
-            __state_->set_exception(make_exception_ptr(
-                      future_error(make_error_code(future_errc::broken_promise))
-                                                      ));
+            __state_->set_exception(make_exception_ptr(future_error(future_errc::broken_promise)));
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
         __state_->__release_shared();
     }
diff --git a/libcxx/src/ios.instantiations.cpp b/libcxx/src/ios.instantiations.cpp
index 01fe199acb3c62..070f9891947c6f 100644
--- a/libcxx/src/ios.instantiations.cpp
+++ b/libcxx/src/ios.instantiations.cpp
@@ -31,7 +31,7 @@ template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS basic_ostream<wchar_t>;
 #endif
 
 // Additional instantiations added later. Whether programs rely on these being
-// available is protected by _LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1.
+// available is protected by _LIBCPP_AVAILABILITY_HAS_NO_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1.
 template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS basic_stringbuf<char>;
 template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS basic_stringstream<char>;
 template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS basic_ostringstream<char>;
diff --git a/libcxx/src/legacy_debug_handler.cpp b/libcxx/src/legacy_debug_handler.cpp
deleted file mode 100644
index cb2025bfc9b6d7..00000000000000
--- a/libcxx/src/legacy_debug_handler.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <__config>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-
-// This file defines the legacy default debug handler and related mechanisms
-// to set it. This is for backwards ABI compatibility with code that has been
-// using this debug handler previously.
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-struct _LIBCPP_TEMPLATE_VIS __libcpp_debug_info {
-  _LIBCPP_EXPORTED_FROM_ABI string what() const;
-
-  const char* __file_;
-  int __line_;
-  const char* __pred_;
-  const char* __msg_;
-};
-
-std::string __libcpp_debug_info::what() const {
-  string msg = __file_;
-  msg += ":" + std::to_string(__line_) + ": _LIBCPP_ASSERT '";
-  msg += __pred_;
-  msg += "' failed. ";
-  msg += __msg_;
-  return msg;
-}
-
-_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __libcpp_abort_debug_function(__libcpp_debug_info const& info) {
-  std::fprintf(stderr, "%s\n", info.what().c_str());
-  std::abort();
-}
-
-typedef void (*__libcpp_debug_function_type)(__libcpp_debug_info const&);
-
-_LIBCPP_EXPORTED_FROM_ABI
-constinit __libcpp_debug_function_type __libcpp_debug_function = __libcpp_abort_debug_function;
-
-_LIBCPP_EXPORTED_FROM_ABI
-bool __libcpp_set_debug_function(__libcpp_debug_function_type __func) {
-  __libcpp_debug_function = __func;
-  return true;
-}
-
-_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 0d5fa6959f1788..48dd233462ab3b 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,3 +1,4 @@
+include(HandleLitArguments)
 add_subdirectory(tools)
 
 # By default, libcxx and libcxxabi share a library directory.
@@ -9,40 +10,32 @@ endif()
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
-macro(serialize_lit_param param value)
-  string(APPEND SERIALIZED_LIT_PARAMS "config.${param} = ${value}\n")
-endmacro()
-
 if (LIBCXX_EXECUTOR)
   message(DEPRECATION "LIBCXX_EXECUTOR is deprecated, please add executor=... to LIBCXX_TEST_PARAMS")
-  serialize_lit_param(executor "\"${LIBCXX_EXECUTOR}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBCXX_EXECUTOR}")
 endif()
 
 if (NOT LIBCXX_ENABLE_EXCEPTIONS)
-  serialize_lit_param(enable_exceptions False)
+  serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
 endif()
 
 if (NOT LIBCXX_ENABLE_RTTI)
-  serialize_lit_param(enable_rtti False)
+  serialize_lit_param(SERIALIZED_LIT_PARAMS enable_rtti False)
 endif()
 
-serialize_lit_param(hardening_mode "\"${LIBCXX_HARDENING_MODE}\"")
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS hardening_mode "${LIBCXX_HARDENING_MODE}")
 
 if (CMAKE_CXX_COMPILER_TARGET)
-  serialize_lit_param(target_triple "\"${CMAKE_CXX_COMPILER_TARGET}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS target_triple "${CMAKE_CXX_COMPILER_TARGET}")
 else()
-  serialize_lit_param(target_triple "\"${LLVM_DEFAULT_TARGET_TRIPLE}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS target_triple "${LLVM_DEFAULT_TARGET_TRIPLE}")
 endif()
 
 if (LLVM_USE_SANITIZER)
-  serialize_lit_param(use_sanitizer "\"${LLVM_USE_SANITIZER}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS use_sanitizer "${LLVM_USE_SANITIZER}")
 endif()
 
-foreach(param IN LISTS LIBCXX_TEST_PARAMS)
-  string(REGEX REPLACE "(.+)=(.+)" "\\1" name "${param}")
-  string(REGEX REPLACE "(.+)=(.+)" "\\2" value "${param}")
-  serialize_lit_param("${name}" "\"${value}\"")
-endforeach()
+serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
 if (NOT DEFINED LIBCXX_TEST_DEPS)
   message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined")
@@ -68,9 +61,9 @@ if (MSVC)
     set(cxx_lib "${cxx_lib}d")
   endif()
 
-  serialize_lit_param(dbg_include "\"${dbg_include}\"")
-  serialize_lit_param(fms_runtime_lib "\"${fms_runtime_lib}\"")
-  serialize_lit_param(cxx_lib "\"${cxx_lib}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS dbg_include "${dbg_include}")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS fms_runtime_lib "${fms_runtime_lib}")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS cxx_lib "${cxx_lib}")
 endif()
 
 if (LIBCXX_INCLUDE_TESTS)
diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp
index a49c9e3e203f96..95b18eba26e775 100644
--- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp
+++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp
@@ -9,7 +9,7 @@
 // Check that format functions aren't marked [[nodiscard]] when
 // _LIBCPP_DISBALE_NODISCARD_EXT is defined
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb
 
 // XFAIL: libcpp-has-no-incomplete-tzdb
diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp
index c76bb00a3723f6..ab68b1fb44f96e 100644
--- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp
@@ -8,7 +8,7 @@
 
 // Check that format functions are marked [[nodiscard]] as a conforming extension
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb
 
 // XFAIL: libcpp-has-no-incomplete-tzdb
diff --git a/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.compile.pass.cpp
index e9ba2ae83b04c3..d443687bac338d 100644
--- a/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.compile.pass.cpp
+++ b/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.compile.pass.cpp
@@ -12,7 +12,7 @@
 // TODO FMT This test should not require std::to_chars(floating-point)
 // XFAIL: availability-fp_to_chars-missing
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_NODISCARD_EXT
 
diff --git a/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.verify.cpp
index 4af8210ce618d5..ede6988f8bb4ee 100644
--- a/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/format.nodiscard_extensions.verify.cpp
@@ -11,7 +11,7 @@
 // TODO FMT This test should not require std::to_chars(floating-point)
 // XFAIL: availability-fp_to_chars-missing
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 #include <format>
 
diff --git a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp
index 1577601908c99a..5ec227311472e8 100644
--- a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp
+++ b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.compile.pass.cpp
@@ -9,7 +9,7 @@
 // Check that ranges algorithms aren't marked [[nodiscard]] when
 // _LIBCPP_DISBALE_NODISCARD_EXT is defined
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_NODISCARD_EXT
 
diff --git a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp
index 77ac5f3f779035..d11dcfcd0d2e41 100644
--- a/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/ranges.nodiscard_extensions.verify.cpp
@@ -8,7 +8,7 @@
 
 // Check that ranges algorithms are marked [[nodiscard]] as a conforming extension
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
 
 #include <algorithm>
 
diff --git a/libcxx/test/libcxx/diagnostics/view_adaptors.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/view_adaptors.nodiscard_extensions.verify.cpp
index 6cd9739f054c68..35c4ab570848d3 100644
--- a/libcxx/test/libcxx/diagnostics/view_adaptors.nodiscard_extensions.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/view_adaptors.nodiscard_extensions.verify.cpp
@@ -8,7 +8,7 @@
 
 // Check that view adaptors are marked [[nodiscard]] as a conforming extension
 
-// UNSUPPORTED: c++03, c++11, c++14 ,c++17, c++20
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 #include <ranges>
 #include <vector>
diff --git a/libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp b/libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp
new file mode 100644
index 00000000000000..772198304b415a
--- /dev/null
+++ b/libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// REQUIRES: -fsized-deallocation
+// ADDITIONAL_COMPILE_FLAGS: -fsized-deallocation
+
+// This test will fail with ASan if the implementation passes different sizes
+// to corresponding allocation and deallocation functions.
+
+#include <memory>
+
+int main(int, char**) {
+  std::allocate_shared<int[]>(std::allocator<int>{}, 10);
+  std::make_shared<int[]>(10);
+
+  std::allocate_shared<int[10]>(std::allocator<int>{});
+  std::make_shared<int[10]>();
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp
index 2918d4f86afb5a..b411ce198e253f 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// clang-cl and cl currently don't support [[no_unique_address]]
-// XFAIL: msvc
+// XFAIL: msvc && (clang-16 || clang-17)
 
 // class lazy_split_view {
 //   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp
index c04dbd7ac0d695..0d8bfbc0316dac 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// clang-cl and cl currently don't support [[no_unique_address]]
-// XFAIL: msvc
+// XFAIL: msvc && (clang-16 || clang-17)
 
 // class split_view {
 //   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
diff --git a/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp
index f2fd2d628bc1dc..8359d267245fea 100644
--- a/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp
@@ -8,9 +8,7 @@
 
 // UNSUPPORTED: no-localization
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// clang-cl and cl currently don't support [[no_unique_address]]
-// XFAIL: msvc
+// XFAIL: msvc && (clang-16 || clang-17)
 
 // Test the libc++ extension that the value stored in `std::ranges::istream_view` has been marked
 // as _LIBCPP_NO_UNIQUE_ADDRESS
diff --git a/libcxx/test/std/depr/depr.numeric.imits.has.denorm/deprecated.verify.cpp b/libcxx/test/std/depr/depr.numeric.imits.has.denorm/deprecated.verify.cpp
new file mode 100644
index 00000000000000..82f838c57f1e44
--- /dev/null
+++ b/libcxx/test/std/depr/depr.numeric.imits.has.denorm/deprecated.verify.cpp
@@ -0,0 +1,101 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-unused-value
+
+#include <limits>
+
+#include "type_algorithms.h"
+
+void func() {
+  std::numeric_limits<bool>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<bool>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<bool>::denorm_min();
+
+  std::numeric_limits<int>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<int>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<int>::denorm_min();
+
+  std::numeric_limits<float>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<float>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<float>::denorm_min();
+
+  std::numeric_limits<double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<double>::denorm_min();
+
+  std::numeric_limits<long double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<long double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<long double>::denorm_min();
+
+  std::numeric_limits<const bool>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const bool>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const bool>::denorm_min();
+
+  std::numeric_limits<const int>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const int>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const int>::denorm_min();
+
+  std::numeric_limits<const float>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const float>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const float>::denorm_min();
+
+  std::numeric_limits<const double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const double>::denorm_min();
+
+  std::numeric_limits<const long double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const long double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const long double>::denorm_min();
+
+  std::numeric_limits<volatile bool>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<volatile bool>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<volatile bool>::denorm_min();
+
+  std::numeric_limits<volatile int>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<volatile int>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<volatile int>::denorm_min();
+
+  std::numeric_limits<volatile float>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<volatile float>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<volatile float>::denorm_min();
+
+  std::numeric_limits<volatile double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<volatile double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<volatile double>::denorm_min();
+
+  std::numeric_limits<volatile long double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<volatile long double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<volatile long double>::denorm_min();
+
+  std::numeric_limits<const volatile bool>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const volatile bool>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const volatile bool>::denorm_min();
+
+  std::numeric_limits<const volatile int>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const volatile int>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const volatile int>::denorm_min();
+
+  std::numeric_limits<const volatile float>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const volatile float>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const volatile float>::denorm_min();
+
+  std::numeric_limits<const volatile double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const volatile double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const volatile double>::denorm_min();
+
+  std::numeric_limits<const volatile long double>::has_denorm;      // expected-warning {{'has_denorm' is deprecated}}
+  std::numeric_limits<const volatile long double>::has_denorm_loss; // expected-warning {{'has_denorm_loss' is deprecated}}
+  std::numeric_limits<const volatile long double>::denorm_min();
+
+  std::denorm_indeterminate; // expected-warning {{'denorm_indeterminate' is deprecated}}
+  std::denorm_absent;        // expected-warning {{'denorm_absent' is deprecated}}
+  std::denorm_present;       // expected-warning {{'denorm_present' is deprecated}}
+}
diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp
index 1d4c2686c7bb1c..9d0b27c24eb891 100644
--- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/const_data_members.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 #include <limits>
 
 #include "test_macros.h"
diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp
index 7ca5a3defa8c0c..ae9f3d733ea6ec 100644
--- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 // test numeric_limits
 
 // has_denorm
diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp
index 7b88f3c2b292d8..e5eac655c18f17 100644
--- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/has_denorm_loss.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 // test numeric_limits
 
 // has_denorm_loss
diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits/default.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits/default.pass.cpp
index 88ca21fbba481d..ab96903c24f03a 100644
--- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits/default.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits/default.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 // test numeric_limits
 
 // The default numeric_limits<T> template shall have all members, but with
diff --git a/libcxx/test/std/language.support/support.limits/limits/round.style/check_values.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/round.style/check_values.pass.cpp
index bb47763f376c81..e40526c7d02248 100644
--- a/libcxx/test/std/language.support/support.limits/limits/round.style/check_values.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/limits/round.style/check_values.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 // test numeric_limits
 
 // float_denorm_style
diff --git a/libcxx/test/std/numerics/c.math/cmath.pass.cpp b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
index e2062cc7ecfd91..11a3de748cb7a6 100644
--- a/libcxx/test/std/numerics/c.math/cmath.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
@@ -603,12 +603,20 @@ void test_fpclassify()
     static_assert((std::is_same<decltype(std::fpclassify(0)), int>::value), "");
     static_assert((std::is_same<decltype(std::fpclassify((long double)0)), int>::value), "");
     static_assert((std::is_same<decltype(fpclassify(Ambiguous())), Ambiguous>::value), "");
+    static_assert((std::is_same<decltype(fpclassify(Value<float>())), int>::value), "");
+    static_assert((std::is_same<decltype(fpclassify(Value<double>())), int>::value), "");
+    static_assert((std::is_same<decltype(fpclassify(Value<long double>())), int>::value), "");
+    ASSERT_NOEXCEPT(std::fpclassify((float)0));
+    ASSERT_NOEXCEPT(std::fpclassify((double)0));
+    ASSERT_NOEXCEPT(std::fpclassify((long double)0));
+    ASSERT_NOEXCEPT(std::fpclassify(0));
     assert(std::fpclassify(-1.0) == FP_NORMAL);
     assert(std::fpclassify(0) == FP_ZERO);
     assert(std::fpclassify(1) == FP_NORMAL);
     assert(std::fpclassify(-1) == FP_NORMAL);
     assert(std::fpclassify(std::numeric_limits<int>::max()) == FP_NORMAL);
     assert(std::fpclassify(std::numeric_limits<int>::min()) == FP_NORMAL);
+    assert(std::fpclassify(Value<double, 1>()) == FP_NORMAL);
 }
 
 void test_isfinite()
diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp
index 07092b74be8738..b894bc542be10b 100644
--- a/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp
@@ -10,9 +10,10 @@
 
 // constexpr auto size() const requires see below;
 
-#include <ranges>
 #include <cassert>
+#include <concepts>
 #include <limits>
+#include <ranges>
 
 #include "test_macros.h"
 #include "types.h"
@@ -89,6 +90,15 @@ constexpr bool test() {
     assert(io.size() == 0);
   }
 
+  // Make sure iota_view<short, short> works properly. For details,
+  // see https://github.com/llvm/llvm-project/issues/67551.
+  {
+    static_assert(std::ranges::sized_range<std::ranges::iota_view<short, short>>);
+    std::ranges::iota_view<short, short> io(10, 20);
+    std::same_as<unsigned int> auto sz = io.size();
+    assert(sz == 10);
+  }
+
   return true;
 }
 
diff --git a/libcxx/test/std/thread/futures/futures.future_error/code.pass.cpp b/libcxx/test/std/thread/futures/futures.future_error/code.pass.cpp
index 99d2ade796e458..22d74bba803d9b 100644
--- a/libcxx/test/std/thread/futures/futures.future_error/code.pass.cpp
+++ b/libcxx/test/std/thread/futures/futures.future_error/code.pass.cpp
@@ -9,49 +9,43 @@
 // UNSUPPORTED: no-threads
 
 // <future>
-
+//
 // class future_error
-//     future_error(error_code __ec);  // exposition only
-//     explicit future_error(future_errc _Ev) : __ec_(make_error_code(_Ev)) {} // C++17
-
-// const error_code& code() const throw();
+//
+// const error_code& code() const noexcept;
 
-#include <future>
 #include <cassert>
+#include <future>
+#include <utility>
 
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    {
-        std::error_code ec = std::make_error_code(std::future_errc::broken_promise);
-        std::future_error f(ec);
-        assert(f.code() == ec);
-    }
-    {
-        std::error_code ec = std::make_error_code(std::future_errc::future_already_retrieved);
-        std::future_error f(ec);
-        assert(f.code() == ec);
-    }
-    {
-        std::error_code ec = std::make_error_code(std::future_errc::promise_already_satisfied);
-        std::future_error f(ec);
-        assert(f.code() == ec);
-    }
-    {
-        std::error_code ec = std::make_error_code(std::future_errc::no_state);
-        std::future_error f(ec);
-        assert(f.code() == ec);
-    }
-#if TEST_STD_VER > 14
-    {
-        std::future_error f(std::future_errc::broken_promise);
-        assert(f.code() == std::make_error_code(std::future_errc::broken_promise));
-    }
-    {
-        std::future_error f(std::future_errc::no_state);
-        assert(f.code() == std::make_error_code(std::future_errc::no_state));
-    }
+int main(int, char**) {
+  ASSERT_NOEXCEPT(std::declval<std::future_error const&>().code());
+  ASSERT_SAME_TYPE(decltype(std::declval<std::future_error const&>().code()), std::error_code const&);
+
+  // Before C++17, we can't construct std::future_error directly in a standards-conforming way
+#if TEST_STD_VER >= 17
+  {
+    std::future_error const f(std::future_errc::broken_promise);
+    std::error_code const& code = f.code();
+    assert(code == std::make_error_code(std::future_errc::broken_promise));
+  }
+  {
+    std::future_error const f(std::future_errc::future_already_retrieved);
+    std::error_code const& code = f.code();
+    assert(code == std::make_error_code(std::future_errc::future_already_retrieved));
+  }
+  {
+    std::future_error const f(std::future_errc::promise_already_satisfied);
+    std::error_code const& code = f.code();
+    assert(code == std::make_error_code(std::future_errc::promise_already_satisfied));
+  }
+  {
+    std::future_error const f(std::future_errc::no_state);
+    std::error_code const& code = f.code();
+    assert(code == std::make_error_code(std::future_errc::no_state));
+  }
 #endif
 
   return 0;
diff --git a/libcxx/test/std/thread/futures/futures.future_error/ctor.pass.cpp b/libcxx/test/std/thread/futures/futures.future_error/ctor.pass.cpp
new file mode 100644
index 00000000000000..bce34105a50922
--- /dev/null
+++ b/libcxx/test/std/thread/futures/futures.future_error/ctor.pass.cpp
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <future>
+
+// class future_error
+//
+// explicit future_error(future_errc e); // since C++17
+
+#include <cassert>
+#include <future>
+#include <type_traits>
+
+int main(int, char**) {
+  {
+    std::future_error f(std::future_errc::broken_promise);
+    assert(f.code() == std::make_error_code(std::future_errc::broken_promise));
+  }
+  {
+    std::future_error f(std::future_errc::future_already_retrieved);
+    assert(f.code() == std::make_error_code(std::future_errc::future_already_retrieved));
+  }
+  {
+    std::future_error f(std::future_errc::promise_already_satisfied);
+    assert(f.code() == std::make_error_code(std::future_errc::promise_already_satisfied));
+  }
+  {
+    std::future_error f(std::future_errc::no_state);
+    assert(f.code() == std::make_error_code(std::future_errc::no_state));
+  }
+
+  // Make sure the constructor is explicit
+  static_assert(!std::is_convertible_v<std::future_errc, std::future_error>);
+
+  return 0;
+}
diff --git a/libcxx/test/std/thread/futures/futures.future_error/types.pass.cpp b/libcxx/test/std/thread/futures/futures.future_error/types.compile.pass.cpp
similarity index 72%
rename from libcxx/test/std/thread/futures/futures.future_error/types.pass.cpp
rename to libcxx/test/std/thread/futures/futures.future_error/types.compile.pass.cpp
index 8e77c01e73cea4..478f0c2a676aa8 100644
--- a/libcxx/test/std/thread/futures/futures.future_error/types.pass.cpp
+++ b/libcxx/test/std/thread/futures/futures.future_error/types.compile.pass.cpp
@@ -15,12 +15,4 @@
 #include <future>
 #include <type_traits>
 
-#include "test_macros.h"
-
-int main(int, char**)
-{
-    static_assert((std::is_convertible<std::future_error*,
-                                       std::logic_error*>::value), "");
-
-  return 0;
-}
+static_assert(std::is_convertible<std::future_error*, std::logic_error*>::value, "");
diff --git a/libcxx/test/std/thread/futures/futures.future_error/what.pass.cpp b/libcxx/test/std/thread/futures/futures.future_error/what.pass.cpp
index 48ae35b6eb883d..fba2bd80d7f89a 100644
--- a/libcxx/test/std/thread/futures/futures.future_error/what.pass.cpp
+++ b/libcxx/test/std/thread/futures/futures.future_error/what.pass.cpp
@@ -13,39 +13,53 @@
 //
 // XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11}}
 
-// <future>
+// VC Runtime's std::exception::what() method is not marked as noexcept, so
+// this fails.
+// UNSUPPORTED: target=x86_64-pc-windows-msvc
 
+// <future>
+//
 // class future_error
+//
+// const char* what() const noexcept;
 
-// const char* what() const throw();
-
-#include <future>
-#include <cstring>
 #include <cassert>
+#include <future>
+#include <string_view>
+#include <utility>
 
 #include "test_macros.h"
 
-int main(int, char**)
-{
-    {
-        std::future_error f(std::make_error_code(std::future_errc::broken_promise));
-        LIBCPP_ASSERT(std::strcmp(f.what(), "The associated promise has been destructed prior "
-                      "to the associated state becoming ready.") == 0);
-    }
-    {
-        std::future_error f(std::make_error_code(std::future_errc::future_already_retrieved));
-        LIBCPP_ASSERT(std::strcmp(f.what(), "The future has already been retrieved from "
-                      "the promise or packaged_task.") == 0);
-    }
-    {
-        std::future_error f(std::make_error_code(std::future_errc::promise_already_satisfied));
-        LIBCPP_ASSERT(std::strcmp(f.what(), "The state of the promise has already been set.") == 0);
-    }
-    {
-        std::future_error f(std::make_error_code(std::future_errc::no_state));
-        LIBCPP_ASSERT(std::strcmp(f.what(), "Operation not permitted on an object without "
-                      "an associated state.") == 0);
-    }
+int main(int, char**) {
+  ASSERT_NOEXCEPT(std::declval<std::future_error const&>().what());
+  ASSERT_SAME_TYPE(decltype(std::declval<std::future_error const&>().what()), char const*);
+
+  // Before C++17, we can't construct std::future_error directly in a standards-conforming way
+#if TEST_STD_VER >= 17
+  {
+    std::future_error const f(std::future_errc::broken_promise);
+    char const* what = f.what();
+    LIBCPP_ASSERT(what == std::string_view{"The associated promise has been destructed prior "
+                                           "to the associated state becoming ready."});
+  }
+  {
+    std::future_error f(std::future_errc::future_already_retrieved);
+    char const* what = f.what();
+    LIBCPP_ASSERT(what == std::string_view{"The future has already been retrieved from "
+                                           "the promise or packaged_task."});
+  }
+  {
+    std::future_error f(std::future_errc::promise_already_satisfied);
+    char const* what = f.what();
+    LIBCPP_ASSERT(what == std::string_view{"The state of the promise has already been set."});
+  }
+  {
+    std::future_error f(std::future_errc::no_state);
+    char const* what = f.what();
+    LIBCPP_ASSERT(what == std::string_view{"Operation not permitted on an object without "
+                                           "an associated state."});
+  }
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.const/pr40459.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.const/pr40459.pass.cpp
new file mode 100644
index 00000000000000..326c2193f592ec
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.const/pr40459.pass.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <memory>
+
+// weak_ptr
+
+// template<class Y> weak_ptr(const weak_ptr<Y>& r);
+// template<class Y> weak_ptr(weak_ptr<Y>&& r);
+//
+// Regression test for https://github.com/llvm/llvm-project/issues/40459
+// Verify that these constructors never attempt a derived-to-virtual-base
+// conversion on a dangling weak_ptr.
+
+#include <cassert>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "test_macros.h"
+
+struct A {
+  int i;
+  virtual ~A() {}
+};
+struct B : public virtual A {
+  int j;
+};
+struct Deleter {
+  void operator()(void*) const {
+    // do nothing
+  }
+};
+
+int main(int, char**) {
+#if TEST_STD_VER >= 11
+  alignas(B) char buffer[sizeof(B)];
+#else
+  std::aligned_storage<sizeof(B), std::alignment_of<B>::value>::type buffer;
+#endif
+  B* pb                 = ::new ((void*)&buffer) B();
+  std::shared_ptr<B> sp = std::shared_ptr<B>(pb, Deleter());
+  std::weak_ptr<B> wp   = sp;
+  sp                    = nullptr;
+  assert(wp.expired());
+
+  // Overwrite the B object with junk.
+  std::memset(&buffer, '*', sizeof(buffer));
+
+  std::weak_ptr<A> wq = wp;
+  assert(wq.expired());
+  std::weak_ptr<A> wr = std::move(wp);
+  assert(wr.expired());
+
+  return 0;
+}
diff --git a/libcxx/utils/data/ignore_format.txt b/libcxx/utils/data/ignore_format.txt
index 34b647a3bd2123..24e207169c8148 100644
--- a/libcxx/utils/data/ignore_format.txt
+++ b/libcxx/utils/data/ignore_format.txt
@@ -84,7 +84,6 @@ libcxx/include/__algorithm/swap_ranges.h
 libcxx/include/__algorithm/transform.h
 libcxx/include/__algorithm/uniform_random_bit_generator_adaptor.h
 libcxx/include/__algorithm/unwrap_iter.h
-libcxx/include/__algorithm/unwrap_range.h
 libcxx/include/any
 libcxx/include/array
 libcxx/include/__atomic/atomic_base.h
@@ -524,7 +523,6 @@ libcxx/src/include/ryu/ryu.h
 libcxx/src/include/sso_allocator.h
 libcxx/src/ios.cpp
 libcxx/src/iostream.cpp
-libcxx/src/legacy_debug_handler.cpp
 libcxx/src/locale.cpp
 libcxx/src/memory.cpp
 libcxx/src/mutex.cpp
@@ -5616,9 +5614,6 @@ libcxx/test/std/thread/futures/futures.errors/equivalent_int_error_condition.pas
 libcxx/test/std/thread/futures/futures.errors/future_category.pass.cpp
 libcxx/test/std/thread/futures/futures.errors/make_error_code.pass.cpp
 libcxx/test/std/thread/futures/futures.errors/make_error_condition.pass.cpp
-libcxx/test/std/thread/futures/futures.future_error/code.pass.cpp
-libcxx/test/std/thread/futures/futures.future_error/types.pass.cpp
-libcxx/test/std/thread/futures/futures.future_error/what.pass.cpp
 libcxx/test/std/thread/futures/futures.overview/future_errc.pass.cpp
 libcxx/test/std/thread/futures/futures.overview/future_status.pass.cpp
 libcxx/test/std/thread/futures/futures.overview/is_error_code_enum_future_errc.pass.cpp
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt
index b65ac3a9d16422..9f95c736d63f46 100644
--- a/libcxxabi/test/CMakeLists.txt
+++ b/libcxxabi/test/CMakeLists.txt
@@ -1,4 +1,5 @@
 include(AddLLVM) # for configure_lit_site_cfg and add_lit_testsuite
+include(HandleLitArguments)
 macro(pythonize_bool var)
   if (${var})
     set(${var} True)
@@ -23,40 +24,32 @@ endif()
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
-macro(serialize_lit_param param value)
-  string(APPEND SERIALIZED_LIT_PARAMS "config.${param} = ${value}\n")
-endmacro()
-
 if (LIBCXXABI_EXECUTOR)
   message(DEPRECATION "LIBCXXABI_EXECUTOR is deprecated, please add executor=... to LIBCXXABI_TEST_PARAMS")
-  serialize_lit_param(executor "\"${LIBCXXABI_EXECUTOR}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBCXXABI_EXECUTOR}")
 endif()
 
 if (NOT LIBCXXABI_ENABLE_EXCEPTIONS)
-  serialize_lit_param(enable_exceptions False)
+  serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
 endif()
 
 if (LIBCXXABI_ENABLE_ASSERTIONS)
-  serialize_lit_param(enable_assertions True)
+  serialize_lit_param(SERIALIZED_LIT_PARAMS enable_assertions True)
 endif()
 
-serialize_lit_param(enable_experimental False)
+serialize_lit_param(SERIALIZED_LIT_PARAMS enable_experimental False)
 
 if (LLVM_USE_SANITIZER)
-  serialize_lit_param(use_sanitizer "\"${LLVM_USE_SANITIZER}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS use_sanitizer "${LLVM_USE_SANITIZER}")
 endif()
 
 if (CMAKE_CXX_COMPILER_TARGET)
-  serialize_lit_param(target_triple "\"${CMAKE_CXX_COMPILER_TARGET}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS target_triple "${CMAKE_CXX_COMPILER_TARGET}")
 else()
-  serialize_lit_param(target_triple "\"${LLVM_DEFAULT_TARGET_TRIPLE}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS target_triple "${LLVM_DEFAULT_TARGET_TRIPLE}")
 endif()
 
-foreach(param IN LISTS LIBCXXABI_TEST_PARAMS)
-  string(REGEX REPLACE "(.+)=(.+)" "\\1" name "${param}")
-  string(REGEX REPLACE "(.+)=(.+)" "\\2" value "${param}")
-  serialize_lit_param("${name}" "\"${value}\"")
-endforeach()
+serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXXABI_TEST_PARAMS)
 
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/configs/cmake-bridge.cfg.in"
                "${CMAKE_CURRENT_BINARY_DIR}/cmake-bridge.cfg"
diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt
index 452d988c3726b5..abb019b88ebabd 100644
--- a/libunwind/src/CMakeLists.txt
+++ b/libunwind/src/CMakeLists.txt
@@ -16,6 +16,7 @@ set(LIBUNWIND_C_SOURCES
     UnwindLevel1.c
     UnwindLevel1-gcc-ext.c
     Unwind-sjlj.c
+    Unwind-wasm.c
     )
 set_source_files_properties(${LIBUNWIND_C_SOURCES}
                             PROPERTIES
diff --git a/libunwind/src/Unwind-wasm.c b/libunwind/src/Unwind-wasm.c
index 35f2f9aeab0804..f7f39d38b59c18 100644
--- a/libunwind/src/Unwind-wasm.c
+++ b/libunwind/src/Unwind-wasm.c
@@ -10,13 +10,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "config.h"
-#include "unwind.h"
 #include <stdbool.h>
-#include <threads.h>
+
+#include "config.h"
 
 #ifdef __USING_WASM_EXCEPTIONS__
 
+#include "unwind.h"
+#include <threads.h>
+
 _Unwind_Reason_Code __gxx_personality_wasm0(int version, _Unwind_Action actions,
                                             uint64_t exceptionClass,
                                             _Unwind_Exception *unwind_exception,
@@ -118,4 +120,4 @@ _Unwind_GetRegionStart(struct _Unwind_Context *context) {
   return 0;
 }
 
-#endif
+#endif // defined(__USING_WASM_EXCEPTIONS__)
diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt
index 084da0ab8f354d..21dfbb0a84f0a8 100644
--- a/libunwind/test/CMakeLists.txt
+++ b/libunwind/test/CMakeLists.txt
@@ -1,4 +1,5 @@
 include(AddLLVM) # for add_lit_testsuite
+include(HandleLitArguments)
 macro(pythonize_bool var)
   if (${var})
     set(${var} True)
@@ -14,32 +15,24 @@ pythonize_bool(LIBUNWIND_USES_ARM_EHABI)
 set(AUTO_GEN_COMMENT "## Autogenerated by libunwind configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
-macro(serialize_lit_param param value)
-  string(APPEND SERIALIZED_LIT_PARAMS "config.${param} = ${value}\n")
-endmacro()
-
 if (LIBUNWIND_EXECUTOR)
   message(DEPRECATION "LIBUNWIND_EXECUTOR is deprecated, please add executor=... to LIBUNWIND_TEST_PARAMS")
-  serialize_lit_param(executor "\"${LIBUNWIND_EXECUTOR}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBUNWIND_EXECUTOR}")
 endif()
 
-serialize_lit_param(enable_experimental False)
+serialize_lit_param(SERIALIZED_LIT_PARAMS enable_experimental False)
 
 if (LLVM_USE_SANITIZER)
-  serialize_lit_param(use_sanitizer "\"${LLVM_USE_SANITIZER}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS use_sanitizer "${LLVM_USE_SANITIZER}")
 endif()
 
 if (CMAKE_CXX_COMPILER_TARGET)
-  serialize_lit_param(target_triple "\"${CMAKE_CXX_COMPILER_TARGET}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS target_triple "${CMAKE_CXX_COMPILER_TARGET}")
 else()
-  serialize_lit_param(target_triple "\"${LLVM_DEFAULT_TARGET_TRIPLE}\"")
+  serialize_lit_string_param(SERIALIZED_LIT_PARAMS target_triple "${LLVM_DEFAULT_TARGET_TRIPLE}")
 endif()
 
-foreach(param IN LISTS LIBUNWIND_TEST_PARAMS)
-  string(REGEX REPLACE "(.+)=(.+)" "\\1" name "${param}")
-  string(REGEX REPLACE "(.+)=(.+)" "\\2" value "${param}")
-  serialize_lit_param("${name}" "\"${value}\"")
-endforeach()
+serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBUNWIND_TEST_PARAMS)
 
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/configs/cmake-bridge.cfg.in"
                "${CMAKE_CURRENT_BINARY_DIR}/cmake-bridge.cfg"
diff --git a/lld/COFF/CMakeLists.txt b/lld/COFF/CMakeLists.txt
index acbd2e5fe2a52b..2bbadf75bfa1e7 100644
--- a/lld/COFF/CMakeLists.txt
+++ b/lld/COFF/CMakeLists.txt
@@ -25,6 +25,7 @@ add_lld_library(lldCOFF
   LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
   BinaryFormat
+  BitWriter
   Core
   DebugInfoCodeView
   DebugInfoDWARF
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 4ade2c953c73e4..2d5d4d63d69fa4 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -48,6 +48,8 @@ enum class ExportSource {
   ModuleDefinition,
 };
 
+enum class EmitKind { Obj, LLVM, ASM };
+
 // Represents an /export option.
 struct Export {
   StringRef name;       // N in /export:N or /export:E=N
@@ -70,7 +72,7 @@ struct Export {
   StringRef symbolName;
   StringRef exportName; // Name in DLL
 
-  bool operator==(const Export &e) {
+  bool operator==(const Export &e) const {
     return (name == e.name && extName == e.extName &&
             aliasTarget == e.aliasTarget &&
             ordinal == e.ordinal && noname == e.noname &&
@@ -284,6 +286,7 @@ struct Configuration {
   uint32_t minorSubsystemVersion = 0;
   uint32_t timestamp = 0;
   uint32_t functionPadMin = 0;
+  uint32_t timeTraceGranularity = 0;
   bool dynamicBase = true;
   bool allowBind = true;
   bool cetCompat = false;
@@ -307,10 +310,12 @@ struct Configuration {
   bool swaprunNet = false;
   bool thinLTOEmitImportsFiles;
   bool thinLTOIndexOnly;
+  bool timeTraceEnabled = false;
   bool autoImport = false;
   bool pseudoRelocs = false;
   bool stdcallFixup = false;
   bool writeCheckSum = false;
+  EmitKind emit = EmitKind::Obj;
 };
 
 } // namespace lld::coff
diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp
index 23ddccea695f88..5071b7b79d23e3 100644
--- a/lld/COFF/DebugTypes.cpp
+++ b/lld/COFF/DebugTypes.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -1068,6 +1069,7 @@ TypeMerger::~TypeMerger() = default;
 void TypeMerger::mergeTypesWithGHash() {
   // Load ghashes. Do type servers and PCH objects first.
   {
+    llvm::TimeTraceScope timeScope("Load GHASHes");
     ScopedTimer t1(ctx.loadGHashTimer);
     parallelForEach(dependencySources,
                     [&](TpiSource *source) { source->loadGHashes(); });
@@ -1075,6 +1077,7 @@ void TypeMerger::mergeTypesWithGHash() {
                     [&](TpiSource *source) { source->loadGHashes(); });
   }
 
+  llvm::TimeTraceScope timeScope("Merge types (GHASH)");
   ScopedTimer t2(ctx.mergeGHashTimer);
   GHashState ghashState;
 
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 61a04a74aa6027..c7f4e1f767a543 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -247,6 +247,7 @@ void LinkerDriver::enqueuePath(StringRef path, bool wholeArchive, bool lazy) {
       createFutureForFile(std::string(path)));
   std::string pathStr = std::string(path);
   enqueueTask([=]() {
+    llvm::TimeTraceScope timeScope("File: ", path);
     auto [mb, ec] = future->get();
     if (ec) {
       // Retry reading the file (synchronously) now that we may have added
@@ -332,6 +333,7 @@ void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
       reportBufferError(mbOrErr.takeError(), check(c.getFullName()));
     MemoryBufferRef mb = mbOrErr.get();
     enqueueTask([=]() {
+      llvm::TimeTraceScope timeScope("Archive: ", mb.getBufferIdentifier());
       ctx.driver.addArchiveBuffer(mb, toCOFFString(ctx, sym), parentName,
                                   offsetInArchive);
     });
@@ -348,6 +350,8 @@ void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
     auto mbOrErr = future->get();
     if (mbOrErr.second)
       reportBufferError(errorCodeToError(mbOrErr.second), childName);
+    llvm::TimeTraceScope timeScope("Archive: ",
+                                   mbOrErr.first->getBufferIdentifier());
     // Pass empty string as archive name so that the original filename is
     // used as the buffer identifier.
     ctx.driver.addArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
@@ -645,23 +649,19 @@ void LinkerDriver::addClangLibSearchPaths(const std::string &argv0) {
 
   SmallString<128> libDir(rootDir);
   sys::path::append(libDir, "lib");
-  // We need to prepend the paths here in order to make sure that we always
-  // try to link the clang versions of the builtins over the ones supplied by
-  // MSVC.
-  searchPaths.insert(searchPaths.begin(), saver().save(libDir.str()));
 
   // Add the resource dir library path
   SmallString<128> runtimeLibDir(rootDir);
   sys::path::append(runtimeLibDir, "lib", "clang",
                     std::to_string(LLVM_VERSION_MAJOR), "lib");
-  searchPaths.insert(searchPaths.begin(), saver().save(runtimeLibDir.str()));
-
   // Resource dir + osname, which is hardcoded to windows since we are in the
   // COFF driver.
   SmallString<128> runtimeLibDirWithOS(runtimeLibDir);
   sys::path::append(runtimeLibDirWithOS, "windows");
-  searchPaths.insert(searchPaths.begin(),
-                     saver().save(runtimeLibDirWithOS.str()));
+
+  searchPaths.push_back(saver().save(runtimeLibDirWithOS.str()));
+  searchPaths.push_back(saver().save(runtimeLibDir.str()));
+  searchPaths.push_back(saver().save(libDir.str()));
 }
 
 void LinkerDriver::addWinSysRootLibSearchPaths() {
@@ -978,6 +978,7 @@ std::string LinkerDriver::getImportName(bool asLib) {
 }
 
 void LinkerDriver::createImportLibrary(bool asLib) {
+  llvm::TimeTraceScope timeScope("Create import library");
   std::vector<COFFShortExport> exports;
   for (Export &e1 : ctx.config.exports) {
     COFFShortExport e2;
@@ -1035,6 +1036,7 @@ void LinkerDriver::createImportLibrary(bool asLib) {
 }
 
 void LinkerDriver::parseModuleDefs(StringRef path) {
+  llvm::TimeTraceScope timeScope("Parse def file");
   std::unique_ptr<MemoryBuffer> mb =
       CHECK(MemoryBuffer::getFile(path, /*IsText=*/false,
                                   /*RequiresNullTerminator=*/false,
@@ -1099,6 +1101,7 @@ void LinkerDriver::enqueueTask(std::function<void()> task) {
 }
 
 bool LinkerDriver::run() {
+  llvm::TimeTraceScope timeScope("Read input files");
   ScopedTimer t(ctx.inputFileTimer);
 
   bool didWork = !taskQueue.empty();
@@ -1237,6 +1240,8 @@ static void markAddrsig(Symbol *s) {
 }
 
 static void findKeepUniqueSections(COFFLinkerContext &ctx) {
+  llvm::TimeTraceScope timeScope("Find keep unique sections");
+
   // Exported symbols could be address-significant in other executables or DSOs,
   // so we conservatively mark them as address-significant.
   for (Export &r : ctx.config.exports)
@@ -1329,6 +1334,7 @@ void LinkerDriver::parsePDBAltPath() {
 /// trees into one resource tree.
 /// Call after ObjFile::Instances is complete.
 void LinkerDriver::convertResources() {
+  llvm::TimeTraceScope timeScope("Convert resources");
   std::vector<ObjFile *> resourceObjFiles;
 
   for (ObjFile *f : ctx.objFileInstances) {
@@ -1482,6 +1488,16 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   ArgParser parser(ctx);
   opt::InputArgList args = parser.parse(argsArr);
 
+  // Initialize time trace profiler.
+  config->timeTraceEnabled = args.hasArg(OPT_time_trace_eq);
+  config->timeTraceGranularity =
+      args::getInteger(args, OPT_time_trace_granularity_eq, 500);
+
+  if (config->timeTraceEnabled)
+    timeTraceProfilerInitialize(config->timeTraceGranularity, argsArr[0]);
+
+  llvm::TimeTraceScope timeScope("COFF link");
+
   // Parse and evaluate -mllvm options.
   std::vector<const char *> v;
   v.push_back("lld-link (LLVM option parsing)");
@@ -1489,8 +1505,11 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     v.push_back(arg->getValue());
     config->mllvmOpts.emplace_back(arg->getValue());
   }
-  cl::ResetAllOptionOccurrences();
-  cl::ParseCommandLineOptions(v.size(), v.data());
+  {
+    llvm::TimeTraceScope timeScope2("Parse cl::opt");
+    cl::ResetAllOptionOccurrences();
+    cl::ParseCommandLineOptions(v.size(), v.data());
+  }
 
   // Handle /errorlimit early, because error() depends on it.
   if (auto *arg = args.getLastArg(OPT_errorlimit)) {
@@ -1543,15 +1562,18 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
                                   " (use --error-limit=0 to see all errors)";
 
   // Handle /linkrepro and /reproduce.
-  if (std::optional<std::string> path = getReproduceFile(args)) {
-    Expected<std::unique_ptr<TarWriter>> errOrWriter =
-        TarWriter::create(*path, sys::path::stem(*path));
-
-    if (errOrWriter) {
-      tar = std::move(*errOrWriter);
-    } else {
-      error("/linkrepro: failed to open " + *path + ": " +
-            toString(errOrWriter.takeError()));
+  {
+    llvm::TimeTraceScope timeScope2("Reproducer");
+    if (std::optional<std::string> path = getReproduceFile(args)) {
+      Expected<std::unique_ptr<TarWriter>> errOrWriter =
+          TarWriter::create(*path, sys::path::stem(*path));
+
+      if (errOrWriter) {
+        tar = std::move(*errOrWriter);
+      } else {
+        error("/linkrepro: failed to open " + *path + ": " +
+              toString(errOrWriter.takeError()));
+      }
     }
   }
 
@@ -1563,13 +1585,17 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   }
 
   // Construct search path list.
-  searchPaths.emplace_back("");
-  for (auto *arg : args.filtered(OPT_libpath))
-    searchPaths.push_back(arg->getValue());
-  detectWinSysRoot(args);
-  if (!args.hasArg(OPT_lldignoreenv) && !args.hasArg(OPT_winsysroot))
-    addLibSearchPaths();
-  addClangLibSearchPaths(argsArr[0]);
+  {
+    llvm::TimeTraceScope timeScope2("Search paths");
+    searchPaths.emplace_back("");
+    // Prefer the Clang provided builtins over the ones bundled with MSVC.
+    addClangLibSearchPaths(argsArr[0]);
+    for (auto *arg : args.filtered(OPT_libpath))
+      searchPaths.push_back(arg->getValue());
+    detectWinSysRoot(args);
+    if (!args.hasArg(OPT_lldignoreenv) && !args.hasArg(OPT_winsysroot))
+      addLibSearchPaths();
+  }
 
   // Handle /ignore
   for (auto *arg : args.filtered(OPT_ignore)) {
@@ -1704,16 +1730,22 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
       args.hasFlag(OPT_appcontainer, OPT_appcontainer_no, false);
 
   // Handle /machine
-  if (auto *arg = args.getLastArg(OPT_machine)) {
-    config->machine = getMachineType(arg->getValue());
-    if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN)
-      fatal(Twine("unknown /machine argument: ") + arg->getValue());
-    addWinSysRootLibSearchPaths();
+  {
+    llvm::TimeTraceScope timeScope2("Machine arg");
+    if (auto *arg = args.getLastArg(OPT_machine)) {
+      config->machine = getMachineType(arg->getValue());
+      if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN)
+        fatal(Twine("unknown /machine argument: ") + arg->getValue());
+      addWinSysRootLibSearchPaths();
+    }
   }
 
   // Handle /nodefaultlib:<filename>
-  for (auto *arg : args.filtered(OPT_nodefaultlib))
-    config->noDefaultLibs.insert(findLib(arg->getValue()).lower());
+  {
+    llvm::TimeTraceScope timeScope2("Nodefaultlib");
+    for (auto *arg : args.filtered(OPT_nodefaultlib))
+      config->noDefaultLibs.insert(findLib(arg->getValue()).lower());
+  }
 
   // Handle /nodefaultlib
   if (args.hasArg(OPT_nodefaultlib_all))
@@ -1855,6 +1887,19 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   if (args.hasArg(OPT_lldsavetemps))
     config->saveTemps = true;
 
+  // Handle /lldemit
+  if (auto *arg = args.getLastArg(OPT_lldemit)) {
+    StringRef s = arg->getValue();
+    if (s == "obj")
+      config->emit = EmitKind::Obj;
+    else if (s == "llvm")
+      config->emit = EmitKind::LLVM;
+    else if (s == "asm")
+      config->emit = EmitKind::ASM;
+    else
+      error("/lldemit: unknown option: " + s);
+  }
+
   // Handle /kill-at
   if (args.hasArg(OPT_kill_at))
     config->killAt = true;
@@ -2038,30 +2083,33 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // Create a list of input files. These can be given as OPT_INPUT options
   // and OPT_wholearchive_file options, and we also need to track OPT_start_lib
   // and OPT_end_lib.
-  bool inLib = false;
-  for (auto *arg : args) {
-    switch (arg->getOption().getID()) {
-    case OPT_end_lib:
-      if (!inLib)
-        error("stray " + arg->getSpelling());
-      inLib = false;
-      break;
-    case OPT_start_lib:
-      if (inLib)
-        error("nested " + arg->getSpelling());
-      inLib = true;
-      break;
-    case OPT_wholearchive_file:
-      if (std::optional<StringRef> path = findFileIfNew(arg->getValue()))
-        enqueuePath(*path, true, inLib);
-      break;
-    case OPT_INPUT:
-      if (std::optional<StringRef> path = findFileIfNew(arg->getValue()))
-        enqueuePath(*path, isWholeArchive(*path), inLib);
-      break;
-    default:
-      // Ignore other options.
-      break;
+  {
+    llvm::TimeTraceScope timeScope2("Parse & queue inputs");
+    bool inLib = false;
+    for (auto *arg : args) {
+      switch (arg->getOption().getID()) {
+      case OPT_end_lib:
+        if (!inLib)
+          error("stray " + arg->getSpelling());
+        inLib = false;
+        break;
+      case OPT_start_lib:
+        if (inLib)
+          error("nested " + arg->getSpelling());
+        inLib = true;
+        break;
+      case OPT_wholearchive_file:
+        if (std::optional<StringRef> path = findFileIfNew(arg->getValue()))
+          enqueuePath(*path, true, inLib);
+        break;
+      case OPT_INPUT:
+        if (std::optional<StringRef> path = findFileIfNew(arg->getValue()))
+          enqueuePath(*path, isWholeArchive(*path), inLib);
+        break;
+      default:
+        // Ignore other options.
+        break;
+      }
     }
   }
 
@@ -2084,8 +2132,11 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     raw_svector_ostream stream(buffer);
     stream << "Library search paths:\n";
 
-    for (StringRef path : searchPaths)
+    for (StringRef path : searchPaths) {
+      if (path == "")
+        path = "(cwd)";
       stream << "  " << path << "\n";
+    }
 
     message(buffer);
   }
@@ -2114,6 +2165,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     parseFunctionPadMin(arg);
 
   if (tar) {
+    llvm::TimeTraceScope timeScope("Reproducer: response file");
     tar->append("response.txt",
                 createResponseFile(args, filePaths,
                                    ArrayRef<StringRef>(searchPaths).slice(1)));
@@ -2134,15 +2186,18 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
           machineToStr(config->machine));
 
   // Handle /export
-  for (auto *arg : args.filtered(OPT_export)) {
-    Export e = parseExport(arg->getValue());
-    if (config->machine == I386) {
-      if (!isDecorated(e.name))
-        e.name = saver().save("_" + e.name);
-      if (!e.extName.empty() && !isDecorated(e.extName))
-        e.extName = saver().save("_" + e.extName);
+  {
+    llvm::TimeTraceScope timeScope("Parse /export");
+    for (auto *arg : args.filtered(OPT_export)) {
+      Export e = parseExport(arg->getValue());
+      if (config->machine == I386) {
+        if (!isDecorated(e.name))
+          e.name = saver().save("_" + e.name);
+        if (!e.extName.empty() && !isDecorated(e.extName))
+          e.extName = saver().save("_" + e.extName);
+      }
+      config->exports.push_back(e);
     }
-    config->exports.push_back(e);
   }
 
   // Handle /def
@@ -2163,40 +2218,47 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // that from entry point name.  Must happen before /entry handling,
   // and after the early return when just writing an import library.
   if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) {
+    llvm::TimeTraceScope timeScope("Infer subsystem");
     config->subsystem = inferSubsystem();
     if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN)
       fatal("subsystem must be defined");
   }
 
   // Handle /entry and /dll
-  if (auto *arg = args.getLastArg(OPT_entry)) {
-    config->entry = addUndefined(mangle(arg->getValue()));
-  } else if (!config->entry && !config->noEntry) {
-    if (args.hasArg(OPT_dll)) {
-      StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12"
-                                              : "_DllMainCRTStartup";
-      config->entry = addUndefined(s);
-    } else if (config->driverWdm) {
-      // /driver:wdm implies /entry:_NtProcessStartup
-      config->entry = addUndefined(mangle("_NtProcessStartup"));
-    } else {
-      // Windows specific -- If entry point name is not given, we need to
-      // infer that from user-defined entry name.
-      StringRef s = findDefaultEntry();
-      if (s.empty())
-        fatal("entry point must be defined");
-      config->entry = addUndefined(s);
-      log("Entry name inferred: " + s);
+  {
+    llvm::TimeTraceScope timeScope("Entry point");
+    if (auto *arg = args.getLastArg(OPT_entry)) {
+      config->entry = addUndefined(mangle(arg->getValue()));
+    } else if (!config->entry && !config->noEntry) {
+      if (args.hasArg(OPT_dll)) {
+        StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12"
+                                                : "_DllMainCRTStartup";
+        config->entry = addUndefined(s);
+      } else if (config->driverWdm) {
+        // /driver:wdm implies /entry:_NtProcessStartup
+        config->entry = addUndefined(mangle("_NtProcessStartup"));
+      } else {
+        // Windows specific -- If entry point name is not given, we need to
+        // infer that from user-defined entry name.
+        StringRef s = findDefaultEntry();
+        if (s.empty())
+          fatal("entry point must be defined");
+        config->entry = addUndefined(s);
+        log("Entry name inferred: " + s);
+      }
     }
   }
 
   // Handle /delayload
-  for (auto *arg : args.filtered(OPT_delayload)) {
-    config->delayLoads.insert(StringRef(arg->getValue()).lower());
-    if (config->machine == I386) {
-      config->delayLoadHelper = addUndefined("___delayLoadHelper2@8");
-    } else {
-      config->delayLoadHelper = addUndefined("__delayLoadHelper2");
+  {
+    llvm::TimeTraceScope timeScope("Delay load");
+    for (auto *arg : args.filtered(OPT_delayload)) {
+      config->delayLoads.insert(StringRef(arg->getValue()).lower());
+      if (config->machine == I386) {
+        config->delayLoadHelper = addUndefined("___delayLoadHelper2@8");
+      } else {
+        config->delayLoadHelper = addUndefined("__delayLoadHelper2");
+      }
     }
   }
 
@@ -2290,54 +2352,57 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // This code may add new undefined symbols to the link, which may enqueue more
   // symbol resolution tasks, so we need to continue executing tasks until we
   // converge.
-  do {
-    // Windows specific -- if entry point is not found,
-    // search for its mangled names.
-    if (config->entry)
-      mangleMaybe(config->entry);
-
-    // Windows specific -- Make sure we resolve all dllexported symbols.
-    for (Export &e : config->exports) {
-      if (!e.forwardTo.empty())
-        continue;
-      e.sym = addUndefined(e.name);
-      if (e.source != ExportSource::Directives)
-        e.symbolName = mangleMaybe(e.sym);
-    }
+  {
+    llvm::TimeTraceScope timeScope("Add unresolved symbols");
+    do {
+      // Windows specific -- if entry point is not found,
+      // search for its mangled names.
+      if (config->entry)
+        mangleMaybe(config->entry);
+
+      // Windows specific -- Make sure we resolve all dllexported symbols.
+      for (Export &e : config->exports) {
+        if (!e.forwardTo.empty())
+          continue;
+        e.sym = addUndefined(e.name);
+        if (e.source != ExportSource::Directives)
+          e.symbolName = mangleMaybe(e.sym);
+      }
 
-    // Add weak aliases. Weak aliases is a mechanism to give remaining
-    // undefined symbols final chance to be resolved successfully.
-    for (auto pair : config->alternateNames) {
-      StringRef from = pair.first;
-      StringRef to = pair.second;
-      Symbol *sym = ctx.symtab.find(from);
-      if (!sym)
-        continue;
-      if (auto *u = dyn_cast<Undefined>(sym))
-        if (!u->weakAlias)
-          u->weakAlias = ctx.symtab.addUndefined(to);
-    }
+      // Add weak aliases. Weak aliases is a mechanism to give remaining
+      // undefined symbols final chance to be resolved successfully.
+      for (auto pair : config->alternateNames) {
+        StringRef from = pair.first;
+        StringRef to = pair.second;
+        Symbol *sym = ctx.symtab.find(from);
+        if (!sym)
+          continue;
+        if (auto *u = dyn_cast<Undefined>(sym))
+          if (!u->weakAlias)
+            u->weakAlias = ctx.symtab.addUndefined(to);
+      }
 
-    // If any inputs are bitcode files, the LTO code generator may create
-    // references to library functions that are not explicit in the bitcode
-    // file's symbol table. If any of those library functions are defined in a
-    // bitcode file in an archive member, we need to arrange to use LTO to
-    // compile those archive members by adding them to the link beforehand.
-    if (!ctx.bitcodeFileInstances.empty())
-      for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
-        ctx.symtab.addLibcall(s);
-
-    // Windows specific -- if __load_config_used can be resolved, resolve it.
-    if (ctx.symtab.findUnderscore("_load_config_used"))
-      addUndefined(mangle("_load_config_used"));
-
-    if (args.hasArg(OPT_include_optional)) {
-      // Handle /includeoptional
-      for (auto *arg : args.filtered(OPT_include_optional))
-        if (isa_and_nonnull<LazyArchive>(ctx.symtab.find(arg->getValue())))
-          addUndefined(arg->getValue());
-    }
-  } while (run());
+      // If any inputs are bitcode files, the LTO code generator may create
+      // references to library functions that are not explicit in the bitcode
+      // file's symbol table. If any of those library functions are defined in a
+      // bitcode file in an archive member, we need to arrange to use LTO to
+      // compile those archive members by adding them to the link beforehand.
+      if (!ctx.bitcodeFileInstances.empty())
+        for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
+          ctx.symtab.addLibcall(s);
+
+      // Windows specific -- if __load_config_used can be resolved, resolve it.
+      if (ctx.symtab.findUnderscore("_load_config_used"))
+        addUndefined(mangle("_load_config_used"));
+
+      if (args.hasArg(OPT_include_optional)) {
+        // Handle /includeoptional
+        for (auto *arg : args.filtered(OPT_include_optional))
+          if (isa_and_nonnull<LazyArchive>(ctx.symtab.find(arg->getValue())))
+            addUndefined(arg->getValue());
+      }
+    } while (run());
+  }
 
   // Create wrapped symbols for -wrap option.
   std::vector<WrappedSymbol> wrapped = addWrappedSymbols(ctx, args);
@@ -2395,7 +2460,8 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // If -thinlto-index-only is given, we should create only "index
   // files" and not object files. Index file creation is already done
   // in addCombinedLTOObject, so we are done if that's the case.
-  if (config->thinLTOIndexOnly)
+  // Likewise, don't emit object files for other /lldemit options.
+  if (config->emit != EmitKind::Obj || config->thinLTOIndexOnly)
     return;
 
   // If we generated native object files from bitcode files, this resolves
@@ -2433,6 +2499,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // need to create a .lib file. In MinGW mode, we only do that when the
   // -implib option is given explicitly, for compatibility with GNU ld.
   if (!config->exports.empty() || config->dll) {
+    llvm::TimeTraceScope timeScope("Create .lib exports");
     fixupExports();
     if (!config->noimplib && (!config->mingw || !config->implib.empty()))
       createImportLibrary(/*asLib=*/false);
@@ -2486,6 +2553,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
 
   // Handle /call-graph-ordering-file and /call-graph-profile-sort (default on).
   if (config->callGraphProfileSort) {
+    llvm::TimeTraceScope timeScope("Call graph");
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) {
       parseCallGraphFile(arg->getValue());
     }
@@ -2534,6 +2602,15 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   rootTimer.stop();
   if (config->showTiming)
     ctx.rootTimer.print();
+
+  if (config->timeTraceEnabled) {
+    // Manually stop the topmost "COFF link" scope, since we're shutting down.
+    timeTraceProfilerEnd();
+
+    checkError(timeTraceProfilerWrite(
+        args.getLastArgValue(OPT_time_trace_eq).str(), config->outputFile));
+    timeTraceProfilerCleanup();
+  }
 }
 
 } // namespace lld::coff
diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp
index fa15f816fb5fcd..583f6af070b625 100644
--- a/lld/COFF/DriverUtils.cpp
+++ b/lld/COFF/DriverUtils.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/WindowsManifest/WindowsManifestMerger.h"
 #include <limits>
@@ -661,6 +662,7 @@ static StringRef exportSourceName(ExportSource s) {
 // Performs error checking on all /export arguments.
 // It also sets ordinals.
 void LinkerDriver::fixupExports() {
+  llvm::TimeTraceScope timeScope("Fixup exports");
   // Symbol ordinals must be unique.
   std::set<uint16_t> ords;
   for (Export &e : ctx.config.exports) {
diff --git a/lld/COFF/ICF.cpp b/lld/COFF/ICF.cpp
index 37f5e7549b7fc0..0f43da0dbc101a 100644
--- a/lld/COFF/ICF.cpp
+++ b/lld/COFF/ICF.cpp
@@ -26,6 +26,7 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Parallel.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/xxhash.h"
 #include <algorithm>
@@ -246,6 +247,7 @@ void ICF::forEachClass(std::function<void(size_t, size_t)> fn) {
 // Two sections are considered the same if their section headers,
 // contents and relocations are all the same.
 void ICF::run() {
+  llvm::TimeTraceScope timeScope("ICF");
   ScopedTimer t(ctx.icfTimer);
 
   // Collect only mergeable sections and group by hash value.
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index 541837a7fceca7..a7a08fb2fa6ea5 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -661,6 +661,8 @@ std::optional<Symbol *> ObjFile::createDefined(
     if (prevailing) {
       SectionChunk *c = readSection(sectionNumber, def, getName());
       sparseChunks[sectionNumber] = c;
+      if (!c)
+        return nullptr;
       c->sym = cast<DefinedRegular>(leader);
       c->selection = selection;
       cast<DefinedRegular>(leader)->data = &c->repl;
diff --git a/lld/COFF/LLDMapFile.cpp b/lld/COFF/LLDMapFile.cpp
index c14480aaf821af..58098cf5d6528d 100644
--- a/lld/COFF/LLDMapFile.cpp
+++ b/lld/COFF/LLDMapFile.cpp
@@ -25,6 +25,7 @@
 #include "Writer.h"
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/Support/Parallel.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -92,6 +93,7 @@ void lld::coff::writeLLDMapFile(const COFFLinkerContext &ctx) {
   if (ctx.config.lldmapFile.empty())
     return;
 
+  llvm::TimeTraceScope timeScope(".lldmap file");
   std::error_code ec;
   raw_fd_ostream os(ctx.config.lldmapFile, ec, sys::fs::OF_None);
   if (ec)
diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp
index 6ca000b466a126..7df93191121367 100644
--- a/lld/COFF/LTO.cpp
+++ b/lld/COFF/LTO.cpp
@@ -86,6 +86,20 @@ lto::Config BitcodeCompiler::createConfig() {
   c.CSIRProfile = std::string(ctx.config.ltoCSProfileFile);
   c.RunCSIRInstr = ctx.config.ltoCSProfileGenerate;
   c.PGOWarnMismatch = ctx.config.ltoPGOWarnMismatch;
+  c.TimeTraceEnabled = ctx.config.timeTraceEnabled;
+  c.TimeTraceGranularity = ctx.config.timeTraceGranularity;
+
+  if (ctx.config.emit == EmitKind::LLVM) {
+    c.PostInternalizeModuleHook = [this](size_t task, const Module &m) {
+      if (std::unique_ptr<raw_fd_ostream> os =
+              openLTOOutputFile(ctx.config.outputFile))
+        WriteBitcodeToFile(m, *os, false);
+      return false;
+    };
+  } else if (ctx.config.emit == EmitKind::ASM) {
+    c.CGFileType = CodeGenFileType::AssemblyFile;
+    c.Options.MCOptions.AsmVerbose = true;
+  }
 
   if (ctx.config.saveTemps)
     checkError(c.addSaveTemps(std::string(ctx.config.outputFile) + ".",
@@ -204,6 +218,8 @@ std::vector<InputFile *> BitcodeCompiler::compile() {
     pruneCache(ctx.config.ltoCache, ctx.config.ltoCachePolicy, files);
 
   std::vector<InputFile *> ret;
+  bool emitASM = ctx.config.emit == EmitKind::ASM;
+  const char *Ext = emitASM ? ".s" : ".obj";
   for (unsigned i = 0; i != maxTasks; ++i) {
     StringRef bitcodeFilePath;
     // Get the native object contents either from the cache or from memory.  Do
@@ -226,20 +242,21 @@ std::vector<InputFile *> BitcodeCompiler::compile() {
     if (bitcodeFilePath == "ld-temp.o") {
       ltoObjName =
           saver().save(Twine(ctx.config.outputFile) + ".lto" +
-                       (i == 0 ? Twine("") : Twine('.') + Twine(i)) + ".obj");
+                       (i == 0 ? Twine("") : Twine('.') + Twine(i)) + Ext);
     } else {
       StringRef directory = sys::path::parent_path(bitcodeFilePath);
-      StringRef baseName = sys::path::filename(bitcodeFilePath);
+      StringRef baseName = sys::path::stem(bitcodeFilePath);
       StringRef outputFileBaseName = sys::path::filename(ctx.config.outputFile);
       SmallString<64> path;
       sys::path::append(path, directory,
-                        outputFileBaseName + ".lto." + baseName);
+                        outputFileBaseName + ".lto." + baseName + Ext);
       sys::path::remove_dots(path, true);
       ltoObjName = saver().save(path.str());
     }
-    if (ctx.config.saveTemps)
+    if (ctx.config.saveTemps || emitASM)
       saveBuffer(buf[i].second, ltoObjName);
-    ret.push_back(make<ObjFile>(ctx, MemoryBufferRef(objBuf, ltoObjName)));
+    if (!emitASM)
+      ret.push_back(make<ObjFile>(ctx, MemoryBufferRef(objBuf, ltoObjName)));
   }
 
   return ret;
diff --git a/lld/COFF/MapFile.cpp b/lld/COFF/MapFile.cpp
index f7a4ef9612907d..ed521dd375ed01 100644
--- a/lld/COFF/MapFile.cpp
+++ b/lld/COFF/MapFile.cpp
@@ -36,6 +36,7 @@
 #include "lld/Common/Timer.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -203,6 +204,7 @@ void lld::coff::writeMapFile(COFFLinkerContext &ctx) {
   if (ctx.config.mapFile.empty())
     return;
 
+  llvm::TimeTraceScope timeScope("Map file");
   std::error_code ec;
   raw_fd_ostream os(ctx.config.mapFile, ec, sys::fs::OF_None);
   if (ec)
diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp
index ad8c340f184517..2cf216a6aaad56 100644
--- a/lld/COFF/MarkLive.cpp
+++ b/lld/COFF/MarkLive.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "lld/Common/Timer.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <vector>
 
 namespace lld::coff {
@@ -19,6 +20,7 @@ namespace lld::coff {
 // COMDAT chunks will be ignored by Writer, so they will be excluded
 // from the final output.
 void markLive(COFFLinkerContext &ctx) {
+  llvm::TimeTraceScope timeScope("Mark live");
   ScopedTimer t(ctx.gcTimer);
 
   // We build up a worklist of sections which have been marked as live. We only
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index 2eae92906db95b..e46f5277a8c365 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Object/COFF.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -50,7 +51,8 @@ AutoExporter::AutoExporter(
       "libc++",
       "libc++abi",
       "libFortran_main",
-      "libflang-rt",
+      "libFortranRuntime",
+      "libFortranDecimal",
       "libunwind",
       "libmsvcrt",
       "libucrtbase",
@@ -171,6 +173,7 @@ bool AutoExporter::shouldExport(Defined *sym) const {
 
 void lld::coff::writeDefFile(StringRef name,
                              const std::vector<Export> &exports) {
+  llvm::TimeTraceScope timeScope("Write .def file");
   std::error_code ec;
   raw_fd_ostream os(name, ec, sys::fs::OF_None);
   if (ec)
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
index ea4ddb2d849534..e7bb952808863b 100644
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -232,6 +232,7 @@ defm demangle : B<"demangle",
 def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">,
     HelpText<"Add symbol as undefined, but allow it to remain undefined">;
 def kill_at : F<"kill-at">;
+def lldemit : P<"lldemit", "Specify output type">;
 def lldmingw : F<"lldmingw">;
 def noseh : F<"noseh">;
 def osversion : P_priv<"osversion">;
@@ -289,6 +290,14 @@ def wrap : P_priv<"wrap">;
 
 def vfsoverlay : P<"vfsoverlay", "Path to a vfsoverlay yaml file to optionally look for /defaultlib's in">;
 
+def time_trace_eq: Joined<["--"], "time-trace=">, MetaVarName<"<file>">,
+  HelpText<"Record time trace to <file>">;
+def : Flag<["--"], "time-trace">, Alias<time_trace_eq>,
+  HelpText<"Record time trace to file next to output">;
+
+def time_trace_granularity_eq: Joined<["--"], "time-trace-granularity=">,
+    HelpText<"Minimum time granularity (in microseconds) traced by time profiler">;
+
 // Flags for debugging
 def lldmap : F<"lldmap">;
 def lldmap_file : P_priv<"lldmap">;
diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp
index 5aa81c1fd03bbc..0c4e0a80cf9a14 100644
--- a/lld/COFF/PDB.cpp
+++ b/lld/COFF/PDB.cpp
@@ -57,6 +57,7 @@
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <memory>
 #include <optional>
 
@@ -187,9 +188,6 @@ class DebugSHandler {
   /// The object file whose .debug$S sections we're processing.
   ObjFile &file;
 
-  /// The result of merging type indices.
-  TpiSource *source;
-
   /// The DEBUG_S_STRINGTABLE subsection.  These strings are referred to by
   /// index from other records in the .debug$S section.  All of these strings
   /// need to be added to the global PDB string table, and all references to
@@ -230,11 +228,9 @@ class DebugSHandler {
   void addFrameDataSubsection(SectionChunk *debugChunk,
                               const DebugSubsectionRecord &ss);
 
-  void recordStringTableReferences(CVSymbol sym, uint32_t symOffset);
-
 public:
-  DebugSHandler(PDBLinker &linker, ObjFile &file, TpiSource *source)
-      : linker(linker), file(file), source(source) {}
+  DebugSHandler(PDBLinker &linker, ObjFile &file)
+      : linker(linker), file(file) {}
 
   void handleDebugS(SectionChunk *debugChunk);
 
@@ -1032,10 +1028,11 @@ void PDBLinker::addDebugSymbols(TpiSource *source) {
   if (!source->file)
     return;
 
+  llvm::TimeTraceScope timeScope("Merge symbols");
   ScopedTimer t(ctx.symbolMergingTimer);
   ExitOnError exitOnErr;
   pdb::DbiStreamBuilder &dbiBuilder = builder.getDbiBuilder();
-  DebugSHandler dsh(*this, *source->file, source);
+  DebugSHandler dsh(*this, *source->file);
   // Now do all live .debug$S and .debug$F sections.
   for (SectionChunk *debugChunk : source->file->getDebugChunks()) {
     if (!debugChunk->live || debugChunk->getSize() == 0)
@@ -1106,6 +1103,7 @@ void PDBLinker::addDebug(TpiSource *source) {
   // indices to PDB type and item indices.  If we are using ghashes, types have
   // already been merged.
   if (!ctx.config.debugGHashes) {
+    llvm::TimeTraceScope timeScope("Merge types (Non-GHASH)");
     ScopedTimer t(ctx.typeMergingTimer);
     if (Error e = source->mergeDebugT(&tMerger)) {
       // If type merging failed, ignore the symbols.
@@ -1150,39 +1148,49 @@ static pdb::BulkPublic createPublic(COFFLinkerContext &ctx, Defined *def) {
 // Add all object files to the PDB. Merge .debug$T sections into IpiData and
 // TpiData.
 void PDBLinker::addObjectsToPDB() {
-  ScopedTimer t1(ctx.addObjectsTimer);
-
-  // Create module descriptors
-  for (ObjFile *obj : ctx.objFileInstances)
-    createModuleDBI(obj);
-
-  // Reorder dependency type sources to come first.
-  tMerger.sortDependencies();
-
-  // Merge type information from input files using global type hashing.
-  if (ctx.config.debugGHashes)
-    tMerger.mergeTypesWithGHash();
-
-  // Merge dependencies and then regular objects.
-  for (TpiSource *source : tMerger.dependencySources)
-    addDebug(source);
-  for (TpiSource *source : tMerger.objectSources)
-    addDebug(source);
+  {
+    llvm::TimeTraceScope timeScope("Add objects to PDB");
+    ScopedTimer t1(ctx.addObjectsTimer);
+
+    // Create module descriptors
+    for (ObjFile *obj : ctx.objFileInstances)
+      createModuleDBI(obj);
+
+    // Reorder dependency type sources to come first.
+    tMerger.sortDependencies();
+
+    // Merge type information from input files using global type hashing.
+    if (ctx.config.debugGHashes)
+      tMerger.mergeTypesWithGHash();
+
+    // Merge dependencies and then regular objects.
+    {
+      llvm::TimeTraceScope timeScope("Merge debug info (dependencies)");
+      for (TpiSource *source : tMerger.dependencySources)
+        addDebug(source);
+    }
+    {
+      llvm::TimeTraceScope timeScope("Merge debug info (objects)");
+      for (TpiSource *source : tMerger.objectSources)
+        addDebug(source);
+    }
 
-  builder.getStringTableBuilder().setStrings(pdbStrTab);
-  t1.stop();
+    builder.getStringTableBuilder().setStrings(pdbStrTab);
+  }
 
   // Construct TPI and IPI stream contents.
-  ScopedTimer t2(ctx.tpiStreamLayoutTimer);
+  {
+    llvm::TimeTraceScope timeScope("TPI/IPI stream layout");
+    ScopedTimer t2(ctx.tpiStreamLayoutTimer);
 
-  // Collect all the merged types.
-  if (ctx.config.debugGHashes) {
-    addGHashTypeInfo(ctx, builder);
-  } else {
-    addTypeInfo(builder.getTpiBuilder(), tMerger.getTypeTable());
-    addTypeInfo(builder.getIpiBuilder(), tMerger.getIDTable());
+    // Collect all the merged types.
+    if (ctx.config.debugGHashes) {
+      addGHashTypeInfo(ctx, builder);
+    } else {
+      addTypeInfo(builder.getTpiBuilder(), tMerger.getTypeTable());
+      addTypeInfo(builder.getIpiBuilder(), tMerger.getIDTable());
+    }
   }
-  t2.stop();
 
   if (ctx.config.showSummary) {
     for (TpiSource *source : ctx.tpiSourceList) {
@@ -1193,6 +1201,7 @@ void PDBLinker::addObjectsToPDB() {
 }
 
 void PDBLinker::addPublicsToPDB() {
+  llvm::TimeTraceScope timeScope("Publics layout");
   ScopedTimer t3(ctx.publicsLayoutTimer);
   // Compute the public symbols.
   auto &gsiBuilder = builder.getGsiBuilder();
@@ -1311,6 +1320,7 @@ void PDBLinker::printStats() {
 }
 
 void PDBLinker::addNatvisFiles() {
+  llvm::TimeTraceScope timeScope("Natvis files");
   for (StringRef file : ctx.config.natvisFiles) {
     ErrorOr<std::unique_ptr<MemoryBuffer>> dataOrErr =
         MemoryBuffer::getFile(file);
@@ -1330,6 +1340,7 @@ void PDBLinker::addNatvisFiles() {
 }
 
 void PDBLinker::addNamedStreams() {
+  llvm::TimeTraceScope timeScope("Named streams");
   ExitOnError exitOnErr;
   for (const auto &streamFile : ctx.config.namedStreams) {
     const StringRef stream = streamFile.getKey(), file = streamFile.getValue();
@@ -1504,6 +1515,7 @@ void PDBLinker::addImportFilesToPDB() {
   if (ctx.importFileInstances.empty())
     return;
 
+  llvm::TimeTraceScope timeScope("Import files");
   ExitOnError exitOnErr;
   std::map<std::string, llvm::pdb::DbiModuleDescriptorBuilder *> dllToModuleDbi;
 
@@ -1593,25 +1605,37 @@ void PDBLinker::addImportFilesToPDB() {
 void lld::coff::createPDB(COFFLinkerContext &ctx,
                           ArrayRef<uint8_t> sectionTable,
                           llvm::codeview::DebugInfo *buildId) {
+  llvm::TimeTraceScope timeScope("PDB file");
   ScopedTimer t1(ctx.totalPdbLinkTimer);
-  PDBLinker pdb(ctx);
-
-  pdb.initialize(buildId);
-  pdb.addObjectsToPDB();
-  pdb.addImportFilesToPDB();
-  pdb.addSections(sectionTable);
-  pdb.addNatvisFiles();
-  pdb.addNamedStreams();
-  pdb.addPublicsToPDB();
-
-  ScopedTimer t2(ctx.diskCommitTimer);
-  codeview::GUID guid;
-  pdb.commit(&guid);
-  memcpy(&buildId->PDB70.Signature, &guid, 16);
-
-  t2.stop();
-  t1.stop();
-  pdb.printStats();
+  {
+    PDBLinker pdb(ctx);
+
+    pdb.initialize(buildId);
+    pdb.addObjectsToPDB();
+    pdb.addImportFilesToPDB();
+    pdb.addSections(sectionTable);
+    pdb.addNatvisFiles();
+    pdb.addNamedStreams();
+    pdb.addPublicsToPDB();
+
+    {
+      llvm::TimeTraceScope timeScope("Commit PDB file to disk");
+      ScopedTimer t2(ctx.diskCommitTimer);
+      codeview::GUID guid;
+      pdb.commit(&guid);
+      memcpy(&buildId->PDB70.Signature, &guid, 16);
+    }
+
+    t1.stop();
+    pdb.printStats();
+
+    // Manually start this profile point to measure ~PDBLinker().
+    if (getTimeTraceProfilerInstance() != nullptr)
+      timeTraceProfilerBegin("PDBLinker destructor", StringRef(""));
+  }
+  // Manually end this profile point to measure ~PDBLinker().
+  if (getTimeTraceProfilerInstance() != nullptr)
+    timeTraceProfilerEnd();
 }
 
 void PDBLinker::initialize(llvm::codeview::DebugInfo *buildId) {
@@ -1646,6 +1670,7 @@ void PDBLinker::initialize(llvm::codeview::DebugInfo *buildId) {
 }
 
 void PDBLinker::addSections(ArrayRef<uint8_t> sectionTable) {
+  llvm::TimeTraceScope timeScope("PDB output sections");
   ExitOnError exitOnErr;
   // It's not entirely clear what this is, but the * Linker * module uses it.
   pdb::DbiStreamBuilder &dbiBuilder = builder.getDbiBuilder();
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index b7217d14391a2c..d9673e159769fb 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -473,6 +473,7 @@ void SymbolTable::reportUnresolvable() {
 }
 
 void SymbolTable::resolveRemainingUndefines() {
+  llvm::TimeTraceScope timeScope("Resolve remaining undefined symbols");
   SmallPtrSet<Symbol *, 8> undefs;
   DenseMap<Symbol *, Symbol *> localImports;
 
@@ -878,6 +879,7 @@ void SymbolTable::compileBitcodeFiles() {
   if (ctx.bitcodeFileInstances.empty())
     return;
 
+  llvm::TimeTraceScope timeScope("Compile bitcode");
   ScopedTimer t(ctx.ltoTimer);
   lto.reset(new BitcodeCompiler(ctx));
   for (BitcodeFile *f : ctx.bitcodeFileInstances)
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index c9b6318fe5212b..4f6c2a57f53350 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/xxhash.h"
 #include <algorithm>
 #include <cstdio>
@@ -320,7 +321,10 @@ class Writer {
 };
 } // anonymous namespace
 
-void lld::coff::writeResult(COFFLinkerContext &ctx) { Writer(ctx).run(); }
+void lld::coff::writeResult(COFFLinkerContext &ctx) {
+  llvm::TimeTraceScope timeScope("Write output(s)");
+  Writer(ctx).run();
+}
 
 void OutputSection::addChunk(Chunk *c) {
   chunks.push_back(c);
@@ -563,16 +567,21 @@ void Writer::finalizeAddresses() {
   int pass = 0;
   int margin = 1024 * 100;
   while (true) {
+    llvm::TimeTraceScope timeScope2("Add thunks pass");
+
     // First check whether we need thunks at all, or if the previous pass of
     // adding them turned out ok.
     bool rangesOk = true;
     size_t numChunks = 0;
-    for (OutputSection *sec : ctx.outputSections) {
-      if (!verifyRanges(sec->chunks)) {
-        rangesOk = false;
-        break;
+    {
+      llvm::TimeTraceScope timeScope3("Verify ranges");
+      for (OutputSection *sec : ctx.outputSections) {
+        if (!verifyRanges(sec->chunks)) {
+          rangesOk = false;
+          break;
+        }
+        numChunks += sec->chunks.size();
       }
-      numChunks += sec->chunks.size();
     }
     if (rangesOk) {
       if (pass > 0)
@@ -596,8 +605,11 @@ void Writer::finalizeAddresses() {
     // Try adding thunks everywhere where it is needed, with a margin
     // to avoid things going out of range due to the added thunks.
     bool addressesChanged = false;
-    for (OutputSection *sec : ctx.outputSections)
-      addressesChanged |= createThunks(sec, margin);
+    {
+      llvm::TimeTraceScope timeScope3("Create thunks");
+      for (OutputSection *sec : ctx.outputSections)
+        addressesChanged |= createThunks(sec, margin);
+    }
     // If the verification above thought we needed thunks, we should have
     // added some.
     assert(addressesChanged);
@@ -616,6 +628,8 @@ void Writer::writePEChecksum() {
     return;
   }
 
+  llvm::TimeTraceScope timeScope("PE checksum");
+
   // https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#checksum
   uint32_t *buf = (uint32_t *)buffer->getBufferStart();
   uint32_t size = (uint32_t)(buffer->getBufferSize());
@@ -650,42 +664,44 @@ void Writer::writePEChecksum() {
 
 // The main function of the writer.
 void Writer::run() {
-  ScopedTimer t1(ctx.codeLayoutTimer);
-
-  createImportTables();
-  createSections();
-  appendImportThunks();
-  // Import thunks must be added before the Control Flow Guard tables are added.
-  createMiscChunks();
-  createExportTable();
-  mergeSections();
-  removeUnusedSections();
-  finalizeAddresses();
-  removeEmptySections();
-  assignOutputSectionIndices();
-  setSectionPermissions();
-  createSymbolAndStringTable();
-
-  if (fileSize > UINT32_MAX)
-    fatal("image size (" + Twine(fileSize) + ") " +
-        "exceeds maximum allowable size (" + Twine(UINT32_MAX) + ")");
-
-  openFile(ctx.config.outputFile);
-  if (ctx.config.is64()) {
-    writeHeader<pe32plus_header>();
-  } else {
-    writeHeader<pe32_header>();
-  }
-  writeSections();
-  checkLoadConfig();
-  sortExceptionTable();
-
-  // Fix up the alignment in the TLS Directory's characteristic field,
-  // if a specific alignment value is needed
-  if (tlsAlignment)
-    fixTlsAlignment();
+  {
+    llvm::TimeTraceScope timeScope("Write PE");
+    ScopedTimer t1(ctx.codeLayoutTimer);
+
+    createImportTables();
+    createSections();
+    appendImportThunks();
+    // Import thunks must be added before the Control Flow Guard tables are
+    // added.
+    createMiscChunks();
+    createExportTable();
+    mergeSections();
+    removeUnusedSections();
+    finalizeAddresses();
+    removeEmptySections();
+    assignOutputSectionIndices();
+    setSectionPermissions();
+    createSymbolAndStringTable();
+
+    if (fileSize > UINT32_MAX)
+      fatal("image size (" + Twine(fileSize) + ") " +
+            "exceeds maximum allowable size (" + Twine(UINT32_MAX) + ")");
+
+    openFile(ctx.config.outputFile);
+    if (ctx.config.is64()) {
+      writeHeader<pe32plus_header>();
+    } else {
+      writeHeader<pe32_header>();
+    }
+    writeSections();
+    checkLoadConfig();
+    sortExceptionTable();
 
-  t1.stop();
+    // Fix up the alignment in the TLS Directory's characteristic field,
+    // if a specific alignment value is needed
+    if (tlsAlignment)
+      fixTlsAlignment();
+  }
 
   if (!ctx.config.pdbPath.empty() && ctx.config.debug) {
     assert(buildId);
@@ -701,6 +717,7 @@ void Writer::run() {
   if (errorCount())
     return;
 
+  llvm::TimeTraceScope timeScope("Commit PE to disk");
   ScopedTimer t2(ctx.outputCommitTimer);
   if (auto e = buffer->commit())
     fatal("failed to write output '" + buffer->getPath() +
@@ -878,6 +895,7 @@ void Writer::sortSections() {
 
 // Create output section objects and add them to OutputSections.
 void Writer::createSections() {
+  llvm::TimeTraceScope timeScope("Output sections");
   // First, create the builtin sections.
   const uint32_t data = IMAGE_SCN_CNT_INITIALIZED_DATA;
   const uint32_t bss = IMAGE_SCN_CNT_UNINITIALIZED_DATA;
@@ -1003,6 +1021,7 @@ void Writer::createSections() {
 }
 
 void Writer::createMiscChunks() {
+  llvm::TimeTraceScope timeScope("Misc chunks");
   Configuration *config = &ctx.config;
 
   for (MergeChunk *p : ctx.mergeChunkInstances) {
@@ -1068,6 +1087,7 @@ void Writer::createMiscChunks() {
 // IdataContents class abstracted away the details for us,
 // so we just let it create chunks and add them to the section.
 void Writer::createImportTables() {
+  llvm::TimeTraceScope timeScope("Import tables");
   // Initialize DLLOrder so that import entries are ordered in
   // the same order as in the command line. (That affects DLL
   // initialization order, and this ordering is MSVC-compatible.)
@@ -1097,6 +1117,7 @@ void Writer::appendImportThunks() {
   if (ctx.importFileInstances.empty())
     return;
 
+  llvm::TimeTraceScope timeScope("Import thunks");
   for (ImportFile *file : ctx.importFileInstances) {
     if (!file->live)
       continue;
@@ -1128,6 +1149,7 @@ void Writer::appendImportThunks() {
 }
 
 void Writer::createExportTable() {
+  llvm::TimeTraceScope timeScope("Export table");
   if (!edataSec->chunks.empty()) {
     // Allow using a custom built export table from input object files, instead
     // of having the linker synthesize the tables.
@@ -1148,6 +1170,7 @@ void Writer::createExportTable() {
 }
 
 void Writer::removeUnusedSections() {
+  llvm::TimeTraceScope timeScope("Remove unused sections");
   // Remove sections that we can be sure won't get content, to avoid
   // allocating space for their section headers.
   auto isUnused = [this](OutputSection *s) {
@@ -1163,11 +1186,13 @@ void Writer::removeUnusedSections() {
 // The Windows loader doesn't seem to like empty sections,
 // so we remove them if any.
 void Writer::removeEmptySections() {
+  llvm::TimeTraceScope timeScope("Remove empty sections");
   auto isEmpty = [](OutputSection *s) { return s->getVirtualSize() == 0; };
   llvm::erase_if(ctx.outputSections, isEmpty);
 }
 
 void Writer::assignOutputSectionIndices() {
+  llvm::TimeTraceScope timeScope("Output sections indices");
   // Assign final output section indices, and assign each chunk to its output
   // section.
   uint32_t idx = 1;
@@ -1258,6 +1283,7 @@ std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
 }
 
 void Writer::createSymbolAndStringTable() {
+  llvm::TimeTraceScope timeScope("Symbol and string table");
   // PE/COFF images are limited to 8 byte section names. Longer names can be
   // supported by writing a non-standard string table, but this string table is
   // not mapped at runtime and the long names will therefore be inaccessible.
@@ -1320,6 +1346,7 @@ void Writer::createSymbolAndStringTable() {
 }
 
 void Writer::mergeSections() {
+  llvm::TimeTraceScope timeScope("Merge sections");
   if (!pdataSec->chunks.empty()) {
     firstPdata = pdataSec->chunks.front();
     lastPdata = pdataSec->chunks.back();
@@ -1353,6 +1380,7 @@ void Writer::mergeSections() {
 // Visits all sections to assign incremental, non-overlapping RVAs and
 // file offsets.
 void Writer::assignAddresses() {
+  llvm::TimeTraceScope timeScope("Assign addresses");
   Configuration *config = &ctx.config;
 
   sizeOfHeaders = dosStubSize + sizeof(PEMagic) + sizeof(coff_file_header) +
@@ -1367,6 +1395,7 @@ void Writer::assignAddresses() {
   uint64_t rva = alignTo(sizeOfHeaders, config->align);
 
   for (OutputSection *sec : ctx.outputSections) {
+    llvm::TimeTraceScope timeScope("Section: ", sec->name);
     if (sec == relocSec)
       addBaserels();
     uint64_t rawSize = 0, virtualSize = 0;
@@ -1947,6 +1976,7 @@ void Writer::insertCtorDtorSymbols() {
 // Handles /section options to allow users to overwrite
 // section attributes.
 void Writer::setSectionPermissions() {
+  llvm::TimeTraceScope timeScope("Sections permissions");
   for (auto &p : ctx.config.section) {
     StringRef name = p.first;
     uint32_t perm = p.second;
@@ -1958,6 +1988,7 @@ void Writer::setSectionPermissions() {
 
 // Write section contents to a mmap'ed file.
 void Writer::writeSections() {
+  llvm::TimeTraceScope timeScope("Write sections");
   uint8_t *buf = buffer->getBufferStart();
   for (OutputSection *sec : ctx.outputSections) {
     uint8_t *secBuf = buf + sec->getFileOff();
@@ -1974,6 +2005,8 @@ void Writer::writeSections() {
 }
 
 void Writer::writeBuildId() {
+  llvm::TimeTraceScope timeScope("Write build ID");
+
   // There are two important parts to the build ID.
   // 1) If building with debug info, the COFF debug directory contains a
   //    timestamp as well as a Guid and Age of the PDB.
@@ -2030,6 +2063,7 @@ void Writer::writeBuildId() {
 void Writer::sortExceptionTable() {
   if (!firstPdata)
     return;
+  llvm::TimeTraceScope timeScope("Sort exception table");
   // We assume .pdata contains function table entries only.
   auto bufAddr = [&](Chunk *c) {
     OutputSection *os = ctx.getOutputSection(c);
@@ -2123,6 +2157,7 @@ void Writer::addBaserels() {
   for (OutputSection *sec : ctx.outputSections) {
     if (sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
       continue;
+    llvm::TimeTraceScope timeScope("Base relocations: ", sec->name);
     // Collect all locations for base relocations.
     for (Chunk *c : sec->chunks)
       c->getBaserels(&v);
diff --git a/lld/Common/Filesystem.cpp b/lld/Common/Filesystem.cpp
index 1fc19ef0278860..c2d3644191c9f4 100644
--- a/lld/Common/Filesystem.cpp
+++ b/lld/Common/Filesystem.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TimeProfiler.h"
 #if LLVM_ON_UNIX
 #include <unistd.h>
 #endif
@@ -122,6 +123,7 @@ void lld::unlinkAsync(StringRef path) {
 // is called. We use that class without calling commit() to predict
 // if the given file is writable.
 std::error_code lld::tryCreateFile(StringRef path) {
+  llvm::TimeTraceScope timeScope("Try create output file");
   if (path.empty())
     return std::error_code();
   if (path == "-")
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 174a0a3624f776..048f0ec30ebd28 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -770,6 +770,8 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   uint64_t secAddr = sec.getOutputSection()->addr;
   if (auto *s = dyn_cast<InputSection>(&sec))
     secAddr += s->outSecOff;
+  else if (auto *ehIn = dyn_cast<EhInputSection>(&sec))
+    secAddr += ehIn->getParent()->outSecOff;
   AArch64Relaxer relaxer(sec.relocs());
   for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) {
     const Relocation &rel = sec.relocs()[i];
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index 969d9326a7fc96..097a57514770aa 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -1563,6 +1563,8 @@ void PPC64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   uint64_t secAddr = sec.getOutputSection()->addr;
   if (auto *s = dyn_cast<InputSection>(&sec))
     secAddr += s->outSecOff;
+  else if (auto *ehIn = dyn_cast<EhInputSection>(&sec))
+    secAddr += ehIn->getParent()->outSecOff;
   uint64_t lastPPCRelaxedRelocOff = -1;
   for (const Relocation &rel : sec.relocs()) {
     uint8_t *loc = buf + rel.offset;
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 349ccd218a579e..2135ac23486451 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -7,12 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "OutputSections.h"
+#include "Relocations.h"
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
 using namespace llvm::object;
@@ -47,6 +49,7 @@ class X86_64 : public TargetInfo {
                                         uint8_t stOther) const override;
   bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
                              InputSection *nextIS) const override;
+  bool relaxOnce(int pass) const override;
 };
 } // namespace
 
@@ -305,6 +308,43 @@ bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
   return true;
 }
 
+bool X86_64::relaxOnce(int pass) const {
+  uint64_t minVA = UINT64_MAX, maxVA = 0;
+  for (OutputSection *osec : outputSections) {
+    minVA = std::min(minVA, osec->addr);
+    maxVA = std::max(maxVA, osec->addr + osec->size);
+  }
+  // If the max VA difference is under 2^31, GOT-generating relocations with a 32-bit range cannot overflow.
+  if (isUInt<31>(maxVA - minVA))
+    return false;
+
+  SmallVector<InputSection *, 0> storage;
+  bool changed = false;
+  for (OutputSection *osec : outputSections) {
+    if (!(osec->flags & SHF_EXECINSTR))
+      continue;
+    for (InputSection *sec : getInputSections(*osec, storage)) {
+      for (Relocation &rel : sec->relocs()) {
+        if (rel.expr != R_RELAX_GOT_PC)
+          continue;
+
+        uint64_t v = sec->getRelocTargetVA(
+            sec->file, rel.type, rel.addend,
+            sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr);
+        if (isInt<32>(v))
+          continue;
+        if (rel.sym->auxIdx == 0) {
+          rel.sym->allocateAux();
+          addGotEntry(*rel.sym);
+          changed = true;
+        }
+        rel.expr = R_GOT_PC;
+      }
+    }
+  }
+  return changed;
+}
+
 RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
                            const uint8_t *loc) const {
   switch (type) {
@@ -912,7 +952,8 @@ static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op,
 }
 
 static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) {
-  checkInt(loc, val, 32, rel);
+  assert(isInt<32>(val) &&
+         "GOTPCRELX should not have been relaxed if it overflows");
   const uint8_t op = loc[-2];
   const uint8_t modRm = loc[-1];
 
@@ -989,6 +1030,8 @@ void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   uint64_t secAddr = sec.getOutputSection()->addr;
   if (auto *s = dyn_cast<InputSection>(&sec))
     secAddr += s->outSecOff;
+  else if (auto *ehIn = dyn_cast<EhInputSection>(&sec))
+    secAddr += ehIn->getParent()->outSecOff;
   for (const Relocation &rel : sec.relocs()) {
     if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
       continue;
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index fa0731854689b9..a2c83adc18f54f 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -40,6 +40,13 @@ using namespace llvm::support::endian;
 using namespace lld;
 using namespace lld::elf;
 
+// This function is explicity instantiated in ARM.cpp, don't do it here to avoid
+// warnings with MSVC.
+extern template void ObjFile<ELF32LE>::importCmseSymbols();
+extern template void ObjFile<ELF32BE>::importCmseSymbols();
+extern template void ObjFile<ELF64LE>::importCmseSymbols();
+extern template void ObjFile<ELF64BE>::importCmseSymbols();
+
 bool InputFile::isInGroup;
 uint32_t InputFile::nextGroupId;
 
@@ -315,6 +322,13 @@ template <class ELFT> static void doParseFile(InputFile *file) {
 // Add symbols in File to the symbol table.
 void elf::parseFile(InputFile *file) { invokeELFT(doParseFile, file); }
 
+// This function is explicity instantiated in ARM.cpp. Mark it extern here,
+// to avoid warnings when building with MSVC.
+extern template void ObjFile<ELF32LE>::importCmseSymbols();
+extern template void ObjFile<ELF32BE>::importCmseSymbols();
+extern template void ObjFile<ELF64LE>::importCmseSymbols();
+extern template void ObjFile<ELF64BE>::importCmseSymbols();
+
 template <class ELFT> static void doParseArmCMSEImportLib(InputFile *file) {
   cast<ObjFile<ELFT>>(file)->importCmseSymbols();
 }
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index cc658bdc231988..41c2ba4e307b36 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -289,7 +289,6 @@ template <class ELFT> class ObjFile : public ELFFileBase {
   void initSectionsAndLocalSyms(bool ignoreComdats);
   void postParse();
   void importCmseSymbols();
-  void redirectCmseSymbols();
 
 private:
   void initializeSections(bool ignoreComdats,
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 9305c959d0f0f6..ee27cc15e040a4 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -904,7 +904,7 @@ static void addPltEntry(PltSection &plt, GotPltSection &gotPlt,
                 sym, 0, R_ABS});
 }
 
-static void addGotEntry(Symbol &sym) {
+void elf::addGotEntry(Symbol &sym) {
   in.got->addEntry(sym);
   uint64_t off = sym.getGotOffset();
 
@@ -1055,6 +1055,10 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
     } else if (!isAbsoluteValue(sym)) {
       expr =
           target->adjustGotPcExpr(type, addend, sec->content().data() + offset);
+      // If the target adjusted the expression to R_RELAX_GOT_PC, we may end up
+      // needing the GOT if we can't relax everything.
+      if (expr == R_RELAX_GOT_PC)
+        in.got->hasGotOffRel.store(true, std::memory_order_relaxed);
     }
   }
 
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index 0559245a8f2c00..15a2b5fc177c54 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -137,6 +137,7 @@ struct JumpInstrMod {
 template <class ELFT> void scanRelocations();
 void reportUndefinedSymbols();
 void postScanRelocations();
+void addGotEntry(Symbol &sym);
 
 void hexagonTLSSymbolUpdate(ArrayRef<OutputSection *> outputSections);
 bool hexagonNeedsTLSSymbol(ArrayRef<OutputSection *> outputSections);
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index f412efa3648028..0f7ebf9d7ba840 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -583,13 +583,14 @@ static uint64_t readFdeAddr(uint8_t *buf, int size) {
 uint64_t EhFrameSection::getFdePc(uint8_t *buf, size_t fdeOff,
                                   uint8_t enc) const {
   // The starting address to which this FDE applies is
-  // stored at FDE + 8 byte.
+  // stored at FDE + 8 byte. And this offset is within
+  // the .eh_frame section.
   size_t off = fdeOff + 8;
   uint64_t addr = readFdeAddr(buf + off, enc & 0xf);
   if ((enc & 0x70) == DW_EH_PE_absptr)
     return addr;
   if ((enc & 0x70) == DW_EH_PE_pcrel)
-    return addr + getParent()->addr + off;
+    return addr + getParent()->addr + off + outSecOff;
   fatal("unknown FDE size relative encoding");
 }
 
diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp
index 32bb2164a208b8..41990f40f68b82 100644
--- a/lld/ELF/Target.cpp
+++ b/lld/ELF/Target.cpp
@@ -159,6 +159,8 @@ void TargetInfo::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   uint64_t secAddr = sec.getOutputSection()->addr;
   if (auto *s = dyn_cast<InputSection>(&sec))
     secAddr += s->outSecOff;
+  else if (auto *ehIn = dyn_cast<EhInputSection>(&sec))
+    secAddr += ehIn->getParent()->outSecOff;
   for (const Relocation &rel : sec.relocs()) {
     uint8_t *loc = buf + rel.offset;
     const uint64_t val = SignExtend64(
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index d6adc4ff3d644d..ce72189015348a 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1672,6 +1672,7 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
       changed |= a32p.createFixes();
     }
 
+    finalizeSynthetic(in.got.get());
     if (in.mipsGot)
       in.mipsGot->updateAllocSize();
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 9915f07cc5a7eb..c322b776ff58f6 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -38,6 +38,9 @@ Breaking changes
 COFF Improvements
 -----------------
 
+* Added support for ``--time-trace`` and associated ``--time-trace-granularity``.
+  This generates a .json profile trace of the linker execution.
+
 MinGW Improvements
 ------------------
 
diff --git a/lld/test/COFF/comdat-drectve.s b/lld/test/COFF/comdat-drectve.s
new file mode 100644
index 00000000000000..6f96a8709fc770
--- /dev/null
+++ b/lld/test/COFF/comdat-drectve.s
@@ -0,0 +1,31 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -triple=x86_64-windows-gnu %s -filetype=obj -o %t.obj
+
+# RUN: lld-link %t.obj -out:%t.exe -debug:symtab -subsystem:console
+# RUN: llvm-readobj --coff-exports %t.exe | FileCheck %s
+
+# CHECK: Name: exportedFunc
+
+## This assembly snippet has been reduced from what Clang generates from
+## this C snippet, with -fsanitize=address. Normally, the .drectve
+## section would be a regular section - but when compiled with
+## -fsanitize=address, it becomes a comdat section.
+##
+# void exportedFunc(void) {}
+# void mainCRTStartup(void) {}
+# static __attribute__((section(".drectve"), used)) const char export_chkstk[] =
+#     "-export:exportedFunc";
+
+	.text
+	.globl	exportedFunc
+exportedFunc:
+	retq
+
+	.globl	mainCRTStartup
+mainCRTStartup:
+	retq
+
+	.section	.drectve,"dr",one_only,export_chkstk
+export_chkstk:
+	.asciz	"-export:exportedFunc"
diff --git a/lld/test/COFF/lto-emit-asm.ll b/lld/test/COFF/lto-emit-asm.ll
new file mode 100644
index 00000000000000..bb32a7d8a72709
--- /dev/null
+++ b/lld/test/COFF/lto-emit-asm.ll
@@ -0,0 +1,28 @@
+; REQUIRES: x86
+; RUN: llvm-as %s -o %t.obj
+
+; RUN: lld-link /lldemit:asm /dll /noentry /include:f1 /include:f2 %t.obj /opt:lldltopartitions=1 /out:%t.1p /lldsavetemps
+; RUN: cat %t.1p.lto.s | FileCheck %s
+; RUN: llvm-dis %t.1p.0.4.opt.bc -o - | FileCheck --check-prefix=OPT %s
+
+; RUN: lld-link /lldemit:asm /dll /noentry /include:f1 /include:f2 %t.obj /opt:lldltopartitions=2 /out:%t.2p
+; RUN: cat %t.2p.lto.s %t.2p.lto.1.s | FileCheck %s
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+;; Note: we also check for the presence of comments; /lldemit:asm output should be verbose.
+
+; CHECK-DAG: # -- Begin function f1
+; CHECK-DAG: f1:
+; OPT: define void @f1()
+define void @f1() {
+  ret void
+}
+
+; CHECK-DAG: # -- Begin function f2
+; CHECK-DAG: f2:
+; OPT: define void @f2()
+define void @f2() {
+  ret void
+}
diff --git a/lld/test/COFF/lto-emit-llvm.ll b/lld/test/COFF/lto-emit-llvm.ll
new file mode 100644
index 00000000000000..985058de10a488
--- /dev/null
+++ b/lld/test/COFF/lto-emit-llvm.ll
@@ -0,0 +1,14 @@
+; REQUIRES: x86
+; RUN: llvm-as -o %T/lto.obj %s
+
+; RUN: lld-link /lldemit:llvm /out:%T/lto.bc /entry:main /subsystem:console %T/lto.obj
+; RUN: llvm-dis %T/lto.bc -o - | FileCheck %s
+
+; CHECK: define void @main()
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @main() {
+  ret void
+}
diff --git a/lld/test/COFF/pdb-thinlto.ll b/lld/test/COFF/pdb-thinlto.ll
index 6ea0f28e1ef1e7..10559e2191650e 100644
--- a/lld/test/COFF/pdb-thinlto.ll
+++ b/lld/test/COFF/pdb-thinlto.ll
@@ -29,8 +29,8 @@ declare void @foo()
 
 ; CHECK:                           Modules
 ; CHECK: ============================================================
-; CHECK: Mod 0000 | `{{.*}}main.exe.lto.main.bc`:
-; CHECK: Obj: `{{.*}}main.exe.lto.main.bc`:
-; CHECK: Mod 0001 | `{{.*}}main.exe.lto.foo.bc`:
-; CHECK: Obj: `{{.*}}main.exe.lto.foo.bc`:
+; CHECK: Mod 0000 | `{{.*}}main.exe.lto.main.obj`:
+; CHECK: Obj: `{{.*}}main.exe.lto.main.obj`:
+; CHECK: Mod 0001 | `{{.*}}main.exe.lto.foo.obj`:
+; CHECK: Obj: `{{.*}}main.exe.lto.foo.obj`:
 ; CHECK: Mod 0002 | `* Linker *`:
diff --git a/lld/test/COFF/print-search-paths.s b/lld/test/COFF/print-search-paths.s
index 5c292aafc68a11..463cc55a793740 100644
--- a/lld/test/COFF/print-search-paths.s
+++ b/lld/test/COFF/print-search-paths.s
@@ -4,21 +4,25 @@
 # RUN: llvm-mc -triple i686-windows-msvc %s -filetype=obj -o %t_32.obj
 # RUN: lld-link -safeseh:no /dll /noentry /winsysroot:%t.dir/sysroot /vctoolsversion:1.1.1.1 /winsdkversion:10.0.1 %t_32.obj -print-search-paths | FileCheck -DSYSROOT=%t.dir -check-prefix=X86 %s
 # CHECK: Library search paths:
-# CHECK:   [[CPATH:.*]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib{{[/\\]}}windows
-# CHECK:   [[CPATH]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib
-# CHECK:   [[CPATH]]lib
-# CHECK:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}lib{{[/\\]}}x64
-# CHECK:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}atlmfc{{[/\\]}}lib{{[/\\]}}x64
-# CHECK:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}ucrt{{[/\\]}}x64
-# CHECK:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}um{{[/\\]}}x64
+# CHECK-NEXT:   (cwd)
+# CHECK-NEXT:   [[CPATH:.*]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib{{[/\\]}}windows
+# CHECK-NEXT:   [[CPATH]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib
+# CHECK-NEXT:   [[CPATH]]lib
+# CHECK-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}DIA SDK{{[/\\]}}lib{{[/\\]}}amd64
+# CHECK-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}lib{{[/\\]}}x64
+# CHECK-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}atlmfc{{[/\\]}}lib{{[/\\]}}x64
+# CHECK-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}ucrt{{[/\\]}}x64
+# CHECK-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}um{{[/\\]}}x64
 # X86: Library search paths:
-# X86:   [[CPATH:.*]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib{{[/\\]}}windows
-# X86:   [[CPATH]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib
-# X86:   [[CPATH]]lib
-# X86:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}lib{{[/\\]}}x86
-# X86:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}atlmfc{{[/\\]}}lib{{[/\\]}}x86
-# X86:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}ucrt{{[/\\]}}x86
-# X86:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}um{{[/\\]}}x86
+# X86-NEXT:   (cwd)
+# X86-NEXT:   [[CPATH:.*]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib{{[/\\]}}windows
+# X86-NEXT:   [[CPATH]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib
+# X86-NEXT:   [[CPATH]]lib
+# X86-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}DIA SDK{{[/\\]}}lib
+# X86-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}lib{{[/\\]}}x86
+# X86-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}atlmfc{{[/\\]}}lib{{[/\\]}}x86
+# X86-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}ucrt{{[/\\]}}x86
+# X86-NEXT:   [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}um{{[/\\]}}x86
 
 
         .text
diff --git a/lld/test/COFF/time-trace.s b/lld/test/COFF/time-trace.s
new file mode 100644
index 00000000000000..436da2483a54f6
--- /dev/null
+++ b/lld/test/COFF/time-trace.s
@@ -0,0 +1,44 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -triple=x86_64-windows-msvc -filetype=obj -o %t.obj %s
+
+# Test implicit trace file name
+# RUN: lld-link %t.obj /entry:main /out:%t1.exe --time-trace --time-trace-granularity=0
+# RUN: cat %t1.exe.time-trace \
+# RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
+# RUN:   | FileCheck %s
+
+# Test specified trace file name
+# RUN: lld-link %t.obj /entry:main /out:%t2.exe --time-trace=%t2.json --time-trace-granularity=0
+# RUN: cat %t2.json \
+# RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
+# RUN:   | FileCheck %s
+
+# Test trace requested to stdout
+# RUN: env LLD_IN_TEST=1 lld-link %t.obj /entry:main /out:%t3.exe --time-trace=- --time-trace-granularity=0 \
+# RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
+# RUN:   | FileCheck %s
+
+# CHECK:      "beginningOfTime": {{[0-9]{16},}}
+# CHECK-NEXT: "traceEvents": [
+
+# Check one event has correct fields
+# CHECK:      "dur":
+# CHECK-NEXT: "name":
+# CHECK-NEXT: "ph":
+# CHECK-NEXT: "pid":
+# CHECK-NEXT: "tid":
+# CHECK-NEXT: "ts":
+
+# Check there are events
+# CHECK: "name": "Read input files"
+
+# Check process_name entry field
+# CHECK: "name": "lld-link{{(.exe)?}}"
+# CHECK: "name": "process_name"
+# CHECK: "name": "thread_name"
+
+.text
+.global main
+main:
+	ret
diff --git a/lld/test/ELF/eh-frame-nonzero-offset-aarch64.s b/lld/test/ELF/eh-frame-nonzero-offset-aarch64.s
new file mode 100644
index 00000000000000..31e63a9c140985
--- /dev/null
+++ b/lld/test/ELF/eh-frame-nonzero-offset-aarch64.s
@@ -0,0 +1,53 @@
+// REQUIRES: aarch64
+// RUN: rm -rf %t && split-file %s %t && cd %t
+
+// RUN: llvm-mc -filetype=obj -triple=aarch64 a.s -o a.o
+// RUN: ld.lld a.o -T eh-frame-non-zero-offset.t -o non-zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame non-zero | FileCheck --check-prefix=NONZERO %s
+// RUN: ld.lld a.o -T eh-frame-zero-offset.t -o zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame zero | FileCheck --check-prefix=ZERO %s
+
+// NONZERO:      {{[0-9]+}}: 0000000000000088 {{.*}} __eh_frame_start
+// NONZERO-NEXT: {{[0-9]+}}: 00000000000000b4 {{.*}} __eh_frame_end
+
+// NONZERO:      0x00000088 10000000 00000000 017a5200 017c1e01
+// NONZERO-NEXT: 0x00000098 1b0c1f00 10000000 18000000 5cffffff
+// NONZERO-NEXT: 0x000000a8 04000000 00000000 00000000
+
+// ZERO:      {{[0-9]+}}: 0000000000000008 {{.*}} __eh_frame_start
+// ZERO-NEXT: {{[0-9]+}}: 0000000000000034 {{.*}} __eh_frame_end
+
+// ZERO:      0x00000008 10000000 00000000 017a5200 017c1e01
+// ZERO-NEXT: 0x00000018 1b0c1f00 10000000 18000000 dcffffff
+// ZERO-NEXT: 0x00000028 04000000 00000000 00000000
+
+//--- eh-frame-non-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    /* Padding within .eh_frame */
+    . += 128;
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- eh-frame-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- a.s
+.section .text.01, "ax",%progbits
+.global f1
+.type f1, %function
+f1:
+.cfi_startproc
+.space 4
+.cfi_endproc
diff --git a/lld/test/ELF/eh-frame-nonzero-offset-arm.s b/lld/test/ELF/eh-frame-nonzero-offset-arm.s
new file mode 100644
index 00000000000000..eec3943a5554c8
--- /dev/null
+++ b/lld/test/ELF/eh-frame-nonzero-offset-arm.s
@@ -0,0 +1,54 @@
+// REQUIRES: arm
+// RUN: rm -rf %t && split-file %s %t && cd %t
+
+// RUN: llvm-mc -filetype=obj -triple=arm a.s -o a.o
+// RUN: ld.lld a.o -T eh-frame-non-zero-offset.t -o non-zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame non-zero | FileCheck --check-prefix=NONZERO %s
+// RUN: ld.lld a.o -T eh-frame-zero-offset.t -o zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame zero | FileCheck --check-prefix=ZERO %s
+
+// NONZERO:      {{[0-9]+}}: 00000084 {{.*}} __eh_frame_start
+// NONZERO-NEXT: {{[0-9]+}}: 000000b0 {{.*}} __eh_frame_end
+
+// NONZERO:      0x00000084 10000000 00000000 017a5200 017c0e01
+// NONZERO-NEXT: 0x00000094 1b0c0d00 10000000 18000000 60ffffff
+// NONZERO-NEXT: 0x000000a4 04000000 00000000 00000000
+
+// ZERO:      {{[0-9]+}}: 00000004 {{.*}} __eh_frame_start
+// ZERO-NEXT: {{[0-9]+}}: 00000030 {{.*}} __eh_frame_end
+
+// ZERO:      0x00000004 10000000 00000000 017a5200 017c0e01
+// ZERO-NEXT: 0x00000014 1b0c0d00 10000000 18000000 e0ffffff
+// ZERO-NEXT: 0x00000024 04000000 00000000 00000000
+
+//--- eh-frame-non-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    /* Padding within .eh_frame */
+    . += 128;
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- eh-frame-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- a.s
+.section .text.01, "ax",%progbits
+.global f1
+.type f1, %function
+f1:
+.cfi_startproc
+.cfi_sections .eh_frame
+.space 4
+.cfi_endproc
diff --git a/lld/test/ELF/eh-frame-nonzero-offset-ppc.s b/lld/test/ELF/eh-frame-nonzero-offset-ppc.s
new file mode 100644
index 00000000000000..7b147cf34715a2
--- /dev/null
+++ b/lld/test/ELF/eh-frame-nonzero-offset-ppc.s
@@ -0,0 +1,53 @@
+// REQUIRES: ppc
+// RUN: rm -rf %t && split-file %s %t && cd %t
+
+// RUN: llvm-mc -filetype=obj -triple=ppc64le a.s -o a.o
+// RUN: ld.lld a.o -T eh-frame-non-zero-offset.t -o non-zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame non-zero | FileCheck --check-prefix=NONZERO %s
+// RUN: ld.lld a.o -T eh-frame-zero-offset.t -o zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame zero | FileCheck --check-prefix=ZERO %s
+
+// NONZERO:      {{[0-9]+}}: 0000000000000088 {{.*}} __eh_frame_start
+// NONZERO-NEXT: {{[0-9]+}}: 00000000000000b4 {{.*}} __eh_frame_end
+
+// NONZERO:      0x00000088 10000000 00000000 017a5200 04784101
+// NONZERO-NEXT: 0x00000098 1b0c0100 10000000 18000000 5cffffff
+// NONZERO-NEXT: 0x000000a8 04000000 00000000 00000000
+
+// ZERO:      {{[0-9]+}}: 0000000000000008 {{.*}} __eh_frame_start
+// ZERO-NEXT: {{[0-9]+}}: 0000000000000034 {{.*}} __eh_frame_end
+
+// ZERO:      0x00000008 10000000 00000000 017a5200 04784101
+// ZERO-NEXT: 0x00000018 1b0c0100 10000000 18000000 dcffffff
+// ZERO-NEXT: 0x00000028 04000000 00000000 00000000
+
+//--- eh-frame-non-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    /* Padding within .eh_frame */
+    . += 128;
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- eh-frame-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- a.s
+.section .text.01, "ax",%progbits
+.global f1
+.type f1, %function
+f1:
+.cfi_startproc
+.space 4
+.cfi_endproc
diff --git a/lld/test/ELF/eh-frame-nonzero-offset-x86.s b/lld/test/ELF/eh-frame-nonzero-offset-x86.s
new file mode 100644
index 00000000000000..e433d45a839397
--- /dev/null
+++ b/lld/test/ELF/eh-frame-nonzero-offset-x86.s
@@ -0,0 +1,55 @@
+// REQUIRES: x86
+// RUN: rm -rf %t && split-file %s %t && cd %t
+
+// RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+// RUN: ld.lld a.o -T eh-frame-non-zero-offset.t -o non-zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame non-zero | FileCheck --check-prefix=NONZERO %s
+// RUN: ld.lld a.o -T eh-frame-zero-offset.t -o zero
+// RUN: llvm-readelf --program-headers --unwind --symbols -x .eh_frame zero | FileCheck --check-prefix=ZERO %s
+
+// NONZERO:      {{[0-9]+}}: 0000000000000088 {{.*}} __eh_frame_start
+// NONZERO-NEXT: {{[0-9]+}}: 00000000000000bc {{.*}} __eh_frame_end
+
+// NONZERO:      0x00000088 14000000 00000000 017a5200 01781001
+// NONZERO-NEXT: 0x00000098 1b0c0708 90010000 14000000 1c000000
+// NONZERO-NEXT: 0x000000a8 58ffffff 04000000 00000000 00000000
+// NONZERO-NEXT: 0x000000b8 00000000
+
+// ZERO:      {{[0-9]+}}: 0000000000000008 {{.*}} __eh_frame_start
+// ZERO-NEXT: {{[0-9]+}}: 000000000000003c {{.*}} __eh_frame_end
+
+// ZERO:      0x00000008 14000000 00000000 017a5200 01781001
+// ZERO-NEXT: 0x00000018 1b0c0708 90010000 14000000 1c000000
+// ZERO-NEXT: 0x00000028 d8ffffff 04000000 00000000 00000000
+// ZERO-NEXT: 0x00000038 00000000
+
+//--- eh-frame-non-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    /* Padding within .eh_frame */
+    . += 128;
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- eh-frame-zero-offset.t
+SECTIONS {
+  .text : { *(.text .text.*) }
+  .eh_frame : {
+    __eh_frame_start = .;
+    *(.eh_frame) ;
+    __eh_frame_end = .;
+  }
+}
+
+//--- a.s
+.section .text.01, "ax",%progbits
+.global f1
+.type f1, %function
+f1:
+.cfi_startproc
+.space 4
+.cfi_endproc
diff --git a/lld/test/ELF/x86-64-gotpc-err.s b/lld/test/ELF/x86-64-gotpc-err.s
deleted file mode 100644
index 6fc7cbe0125c0a..00000000000000
--- a/lld/test/ELF/x86-64-gotpc-err.s
+++ /dev/null
@@ -1,26 +0,0 @@
-# REQUIRES: x86
-# RUN: split-file %s %t
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/a.s -o %t/a.o
-# RUN: not ld.lld -T %t/lds %t/a.o -o /dev/null 2>&1 | FileCheck %s
-
-## Test diagnostics for GOTPCRELX overflows. In addition, test that there is no
-## `>>> defined in` for linker synthesized __stop_* symbols (there is no
-## associated file or linker script line number).
-
-# CHECK:      error: {{.*}}:(.text+0x2): relocation R_X86_64_GOTPCRELX out of range: 2147483655 is not in [-2147483648, 2147483647]; references '__stop_data'
-# CHECK-NEXT: error: {{.*}}:(.text+0x9): relocation R_X86_64_REX_GOTPCRELX out of range: 2147483648 is not in [-2147483648, 2147483647]; references '__stop_data'
-# CHECK-NOT: error:
-
-#--- a.s
-  movl __stop_data@GOTPCREL(%rip), %eax  # out of range
-  movq __stop_data@GOTPCREL(%rip), %rax  # out of range
-  movq __stop_data@GOTPCREL(%rip), %rax  # in range
-
-.section data,"aw",@progbits
-.space 13
-
-#--- lds
-SECTIONS {
-  .text 0x200000 : { *(.text) }
-  data 0x80200000 : { *(data) }
-}
diff --git a/lld/test/ELF/x86-64-gotpc-relax-too-far.s b/lld/test/ELF/x86-64-gotpc-relax-too-far.s
new file mode 100644
index 00000000000000..74aa6d8f65a0d8
--- /dev/null
+++ b/lld/test/ELF/x86-64-gotpc-relax-too-far.s
@@ -0,0 +1,55 @@
+# REQUIRES: x86
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/a.s -o %t/a.o
+# RUN: ld.lld -T %t/lds1 %t/a.o -o %t/bin
+# RUN: llvm-objdump --no-print-imm-hex -d %t/bin | FileCheck --check-prefix=DISASM %s
+# RUN: llvm-readelf -S %t/bin | FileCheck --check-prefixes=GOT %s
+# RUN: ld.lld -T %t/lds2 %t/a.o -o %t/bin2
+# RUN: llvm-readelf -S %t/bin2 | FileCheck --check-prefixes=UNNECESSARY-GOT %s
+
+# DISASM:      <_foo>:
+# DISASM-NEXT: movl    2097146(%rip), %eax
+# DISASM:      <_start>:
+# DISASM-NEXT: movl    1048578(%rip), %eax
+# DISASM-NEXT: movq    1048571(%rip), %rax
+# DISASM-NEXT: leaq    2147483641(%rip), %rax
+# DISASM-NEXT: leal    2147483635(%rip), %eax
+
+# In our implementation, .got is retained even if all GOT-generating relocations are optimized.
+# Make sure .got still exists with the right size.
+# UNNECESSARY-GOT: .got PROGBITS 0000000000300000 101020 000000 00 WA 0 0 8
+#             GOT: .got PROGBITS 0000000000300000 102000 000010 00 WA 0 0 8
+
+#--- a.s
+.section .text.foo,"ax"
+.globl _foo
+.type _foo, @function
+_foo:
+  movl __start_data@GOTPCREL(%rip), %eax  # out of range
+
+.section .text,"ax"
+.globl _start
+.type _start, @function
+_start:
+  movl __stop_data@GOTPCREL(%rip), %eax  # out of range
+  movq __stop_data@GOTPCREL(%rip), %rax  # out of range
+  movq __stop_data@GOTPCREL(%rip), %rax  # in range
+  movl __stop_data@GOTPCREL(%rip), %eax  # in range
+
+.section data,"aw",@progbits
+.space 13
+
+#--- lds1
+SECTIONS {
+  .text.foo 0x100000 : { *(.text.foo) }
+  .text 0x200000 : { *(.text) }
+  .got 0x300000 : { *(.got) }
+  data 0x80200000 : { *(data) }
+}
+#--- lds2
+SECTIONS {
+  .text.foo 0x100000 : { *(.text.foo) }
+  .text 0x200000 : { *(.text) }
+  .got 0x300000 : { *(.got) }
+  data 0x400000 : { *(data) }
+}
diff --git a/lld/test/ELF/x86-64-gotpc-relax.s b/lld/test/ELF/x86-64-gotpc-relax.s
index 0bcc43c72e4249..de858bffc07e39 100644
--- a/lld/test/ELF/x86-64-gotpc-relax.s
+++ b/lld/test/ELF/x86-64-gotpc-relax.s
@@ -13,18 +13,18 @@
 # RUN: ld.lld --no-relax %t.o -o %t2
 # RUN: llvm-objdump --no-print-imm-hex -d %t2 | FileCheck --check-prefix=NORELAX %s
 
-## .got is removed as all GOT-generating relocations are optimized.
+## In our implementation, .got is retained even if all GOT-generating relocations are optimized.
 # CHECK:      Name              Type            Address          Off    Size   ES Flg Lk Inf Al
-# CHECK:      .iplt             PROGBITS        0000000000201210 000210 000010 00  AX  0   0 16
-# CHECK-NEXT: .got.plt          PROGBITS        0000000000202220 000220 000008 00  WA  0   0  8
+# CHECK:      .iplt             PROGBITS        0000000000201280 000280 000010 00  AX  0   0 16
+# CHECK-NEXT: .got              PROGBITS        0000000000202290 000290 000000 00  WA  0   0  8
 
 ## There is one R_X86_64_IRELATIVE relocations.
 # RELOC-LABEL: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entry:
 # CHECK:           Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
-# CHECK:       0000000000202220  0000000000000025 R_X86_64_IRELATIVE                        201172
+# CHECK:       0000000000203290  0000000000000025 R_X86_64_IRELATIVE                        2011e2
 # CHECK-LABEL: Hex dump of section '.got.plt':
-# NOAPPLY-NEXT:  0x00202220 00000000 00000000
-# APPLY-NEXT:    0x00202220 72112000 00000000
+# NOAPPLY-NEXT:  0x00203290 00000000 00000000
+# APPLY-NEXT:    0x00203290 e2112000 00000000
 
 # 0x201173 + 7 - 10 = 0x201170
 # 0x20117a + 7 - 17 = 0x201170
@@ -33,40 +33,40 @@
 # DISASM:      Disassembly of section .text:
 # DISASM-EMPTY:
 # DISASM-NEXT: <foo>:
-# DISASM-NEXT:   201170: 90 nop
+# DISASM-NEXT:   2011e0: 90 nop
 # DISASM:      <hid>:
-# DISASM-NEXT:   201171: 90 nop
+# DISASM-NEXT:   2011e1: 90 nop
 # DISASM:      <ifunc>:
-# DISASM-NEXT:   201172: c3 retq
+# DISASM-NEXT:   2011e2: c3 retq
 # DISASM:      <_start>:
 # DISASM-NEXT: leaq -10(%rip), %rax
 # DISASM-NEXT: leaq -17(%rip), %rax
 # DISASM-NEXT: leaq -23(%rip), %rax
 # DISASM-NEXT: leaq -30(%rip), %rax
-# DISASM-NEXT: movq 4234(%rip), %rax
-# DISASM-NEXT: movq 4227(%rip), %rax
+# DISASM-NEXT: movq 8330(%rip), %rax
+# DISASM-NEXT: movq 8323(%rip), %rax
 # DISASM-NEXT: leaq -52(%rip), %rax
 # DISASM-NEXT: leaq -59(%rip), %rax
 # DISASM-NEXT: leaq -65(%rip), %rax
 # DISASM-NEXT: leaq -72(%rip), %rax
-# DISASM-NEXT: movq 4192(%rip), %rax
-# DISASM-NEXT: movq 4185(%rip), %rax
-# DISASM-NEXT: callq 0x201170 <foo>
-# DISASM-NEXT: callq 0x201170 <foo>
-# DISASM-NEXT: callq 0x201171 <hid>
-# DISASM-NEXT: callq 0x201171 <hid>
-# DISASM-NEXT: callq *4155(%rip)
-# DISASM-NEXT: callq *4149(%rip)
-# DISASM-NEXT: jmp   0x201170 <foo>
+# DISASM-NEXT: movq 8288(%rip), %rax
+# DISASM-NEXT: movq 8281(%rip), %rax
+# DISASM-NEXT: callq 0x2011e0 <foo>
+# DISASM-NEXT: callq 0x2011e0 <foo>
+# DISASM-NEXT: callq 0x2011e1 <hid>
+# DISASM-NEXT: callq 0x2011e1 <hid>
+# DISASM-NEXT: callq *8251(%rip)
+# DISASM-NEXT: callq *8245(%rip)
+# DISASM-NEXT: jmp   0x2011e0 <foo>
 # DISASM-NEXT: nop
-# DISASM-NEXT: jmp   0x201170 <foo>
+# DISASM-NEXT: jmp   0x2011e0 <foo>
 # DISASM-NEXT: nop
-# DISASM-NEXT: jmp   0x201171 <hid>
+# DISASM-NEXT: jmp   0x2011e1 <hid>
 # DISASM-NEXT: nop
-# DISASM-NEXT: jmp   0x201171 <hid>
+# DISASM-NEXT: jmp   0x2011e1 <hid>
 # DISASM-NEXT: nop
-# DISASM-NEXT: jmpq  *4119(%rip)
-# DISASM-NEXT: jmpq  *4113(%rip)
+# DISASM-NEXT: jmpq  *8215(%rip)
+# DISASM-NEXT: jmpq  *8209(%rip)
 
 # NORELAX-LABEL: <_start>:
 # NORELAX-COUNT-12: movq
diff --git a/lld/test/wasm/emit-relocs.s b/lld/test/wasm/emit-relocs.s
index 28ce607167382e..91de6116164f72 100644
--- a/lld/test/wasm/emit-relocs.s
+++ b/lld/test/wasm/emit-relocs.s
@@ -17,6 +17,8 @@ _start:
   drop
   i32.const foo
   drop
+  i32.const __stack_low
+  drop
   end_function
 
 .section .bss.data,"",@
@@ -54,3 +56,9 @@ foo:
 # CHECK-NEXT:         Name:            ret32
 # CHECK-NEXT:         Flags:           [ VISIBILITY_HIDDEN ]
 # CHECK-NEXT:         Function:        1
+# CHECK-NEXT:       - Index:           2
+# CHECK-NEXT:         Kind:            DATA
+# CHECK-NEXT:         Name:            __stack_low
+# CHECK-NEXT:         Flags:           [ VISIBILITY_HIDDEN, ABSOLUTE ]
+# CHECK-NEXT:         Offset:          1040
+# CHECK-NEXT:         Size:            0
diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index 9a381e95f3d9fa..a00e336118d8c8 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -233,7 +233,8 @@ DefinedData *SymbolTable::addOptionalDataSymbol(StringRef name,
   else if (!s || s->isDefined())
     return nullptr;
   LLVM_DEBUG(dbgs() << "addOptionalDataSymbol: " << name << "\n");
-  auto *rtn = replaceSymbol<DefinedData>(s, name, WASM_SYMBOL_VISIBILITY_HIDDEN);
+  auto *rtn = replaceSymbol<DefinedData>(
+      s, name, WASM_SYMBOL_VISIBILITY_HIDDEN | WASM_SYMBOL_ABSOLUTE);
   rtn->setVA(value);
   rtn->referenced = true;
   return rtn;
@@ -243,7 +244,8 @@ DefinedData *SymbolTable::addSyntheticDataSymbol(StringRef name,
                                                  uint32_t flags) {
   LLVM_DEBUG(dbgs() << "addSyntheticDataSymbol: " << name << "\n");
   assert(!find(name));
-  return replaceSymbol<DefinedData>(insertName(name).first, name, flags);
+  return replaceSymbol<DefinedData>(insertName(name).first, name,
+                                    flags | WASM_SYMBOL_ABSOLUTE);
 }
 
 DefinedGlobal *SymbolTable::addSyntheticGlobal(StringRef name, uint32_t flags,
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index ef9547593acdcd..3a9c147161a2d3 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -665,9 +665,14 @@ void LinkingSection::writeBody() {
       } else if (isa<DataSymbol>(sym)) {
         writeStr(sub.os, sym->getName(), "sym name");
         if (auto *dataSym = dyn_cast<DefinedData>(sym)) {
-          writeUleb128(sub.os, dataSym->getOutputSegmentIndex(), "index");
-          writeUleb128(sub.os, dataSym->getOutputSegmentOffset(),
-                       "data offset");
+          if (dataSym->segment) {
+            writeUleb128(sub.os, dataSym->getOutputSegmentIndex(), "index");
+            writeUleb128(sub.os, dataSym->getOutputSegmentOffset(),
+                         "data offset");
+          } else {
+            writeUleb128(sub.os, 0, "index");
+            writeUleb128(sub.os, dataSym->getVA(), "data offset");
+          }
           writeUleb128(sub.os, dataSym->getSize(), "data size");
         }
       } else {
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 380016ce48015f..ce5e666a6f5e1a 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -123,7 +123,10 @@ if(APPLE AND CMAKE_GENERATOR STREQUAL Xcode)
 endif()
 
 set(LLDB_EXPORT_ALL_SYMBOLS 0 CACHE BOOL
-  "Causes lldb to export all symbols when building liblldb.")
+  "Causes lldb to export some private symbols when building liblldb. See lldb/source/API/liblldb-private.exports for the full list of symbols that get exported.")
+
+set(LLDB_EXPORT_ALL_SYMBOLS_EXPORTS_FILE "" CACHE PATH
+  "When `LLDB_EXPORT_ALL_SYMBOLS` is enabled, this specifies the exports file to use when building liblldb.")
 
 if ((NOT MSVC) OR MSVC12)
   add_definitions( -DHAVE_ROUND )
diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py
index 825b7f3787a010..29c926167fb440 100644
--- a/lldb/examples/synthetic/gnu_libstdcpp.py
+++ b/lldb/examples/synthetic/gnu_libstdcpp.py
@@ -892,3 +892,87 @@ def update(self):
         except:
             pass
         return False
+
+
+def VariantSummaryProvider(valobj, dict):
+    raw_obj = valobj.GetNonSyntheticValue()
+    index_obj = raw_obj.GetChildMemberWithName("_M_index")
+    data_obj = raw_obj.GetChildMemberWithName("_M_u")
+    if not (index_obj and index_obj.IsValid() and data_obj and data_obj.IsValid()):
+        return "<Can't find _M_index or _M_u>"
+
+    def get_variant_npos_value(index_byte_size):
+        if index_byte_size == 1:
+            return 0xFF
+        elif index_byte_size == 2:
+            return 0xFFFF
+        else:
+            return 0xFFFFFFFF
+
+    npos_value = get_variant_npos_value(index_obj.GetByteSize())
+    index = index_obj.GetValueAsUnsigned(0)
+    if index == npos_value:
+        return " No Value"
+
+    active_type = data_obj.GetType().GetTemplateArgumentType(index)
+    return f" Active Type = {active_type.GetDisplayTypeName()} "
+
+
+class VariantSynthProvider:
+    def __init__(self, valobj, dict):
+        self.raw_obj = valobj.GetNonSyntheticValue()
+        self.is_valid = False
+        self.index = None
+        self.data_obj = None
+
+    def update(self):
+        try:
+            self.index = self.raw_obj.GetChildMemberWithName(
+                "_M_index"
+            ).GetValueAsSigned(-1)
+            self.is_valid = self.index != -1
+            self.data_obj = self.raw_obj.GetChildMemberWithName("_M_u")
+        except:
+            self.is_valid = False
+        return False
+
+    def has_children(self):
+        return True
+
+    def num_children(self):
+        return 1 if self.is_valid else 0
+
+    def get_child_index(self, name):
+        return 0
+
+    def get_child_at_index(self, index):
+        if not self.is_valid:
+            return None
+        cur = 0
+        node = self.data_obj
+        while cur < self.index:
+            node = node.GetChildMemberWithName("_M_rest")
+            cur += 1
+
+        # _M_storage's type depends on variant field's type "_Type".
+        #  1. if '_Type' is literal type: _Type _M_storage.
+        #  2. otherwise, __gnu_cxx::__aligned_membuf<_Type> _M_storage.
+        #
+        # For 2. we have to cast it to underlying template _Type.
+
+        value = node.GetChildMemberWithName("_M_first").GetChildMemberWithName(
+            "_M_storage"
+        )
+        template_type = value.GetType().GetTemplateArgumentType(0)
+
+        # Literal type will return None for GetTemplateArgumentType(0)
+        if (
+            template_type
+            and "__gnu_cxx::__aligned_membuf" in value.GetType().GetDisplayTypeName()
+            and template_type.IsValid()
+        ):
+            value = value.Cast(template_type)
+
+        if value.IsValid():
+            return value.Clone("Value")
+        return None
diff --git a/lldb/include/lldb/API/SBAttachInfo.h b/lldb/include/lldb/API/SBAttachInfo.h
index ea1145e625856f..c18655fee77e0a 100644
--- a/lldb/include/lldb/API/SBAttachInfo.h
+++ b/lldb/include/lldb/API/SBAttachInfo.h
@@ -197,6 +197,7 @@ class LLDB_API SBAttachInfo {
 
 protected:
   friend class SBTarget;
+  friend class SBPlatform;
 
   friend class lldb_private::ScriptInterpreter;
 
diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h
index 29cf2c16fad4bd..218113a7a391f3 100644
--- a/lldb/include/lldb/API/SBDebugger.h
+++ b/lldb/include/lldb/API/SBDebugger.h
@@ -487,6 +487,7 @@ class LLDB_API SBDebugger {
   friend class SBProcess;
   friend class SBSourceManager;
   friend class SBStructuredData;
+  friend class SBPlatform;
   friend class SBTarget;
   friend class SBTrace;
 
diff --git a/lldb/include/lldb/API/SBPlatform.h b/lldb/include/lldb/API/SBPlatform.h
index 6567277a5d161e..e0acc7003a54bc 100644
--- a/lldb/include/lldb/API/SBPlatform.h
+++ b/lldb/include/lldb/API/SBPlatform.h
@@ -10,6 +10,7 @@
 #define LLDB_API_SBPLATFORM_H
 
 #include "lldb/API/SBDefines.h"
+#include "lldb/API/SBProcess.h"
 
 #include <functional>
 
@@ -18,6 +19,7 @@ struct PlatformShellCommand;
 
 namespace lldb {
 
+class SBAttachInfo;
 class SBLaunchInfo;
 
 class LLDB_API SBPlatformConnectOptions {
@@ -149,6 +151,9 @@ class LLDB_API SBPlatform {
 
   SBError Launch(SBLaunchInfo &launch_info);
 
+  SBProcess Attach(SBAttachInfo &attach_info, const SBDebugger &debugger,
+                   SBTarget &target, SBError &error);
+
   SBError Kill(const lldb::pid_t pid);
 
   SBError
diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h
index 16527bb0291fcb..8c1c81418f83d1 100644
--- a/lldb/include/lldb/API/SBProcess.h
+++ b/lldb/include/lldb/API/SBProcess.h
@@ -449,6 +449,7 @@ class LLDB_API SBProcess {
   friend class SBExecutionContext;
   friend class SBFunction;
   friend class SBModule;
+  friend class SBPlatform;
   friend class SBTarget;
   friend class SBThread;
   friend class SBValue;
diff --git a/lldb/include/lldb/Expression/DWARFExpression.h b/lldb/include/lldb/Expression/DWARFExpression.h
index 380910ba0ea3d6..5e03f539a272ca 100644
--- a/lldb/include/lldb/Expression/DWARFExpression.h
+++ b/lldb/include/lldb/Expression/DWARFExpression.h
@@ -45,7 +45,7 @@ class DWARFExpression {
   DWARFExpression(const DataExtractor &data);
 
   /// Destructor
-  virtual ~DWARFExpression();
+  ~DWARFExpression();
 
   /// Return true if the location expression contains data
   bool IsValid() const;
diff --git a/lldb/include/lldb/Utility/Instrumentation.h b/lldb/include/lldb/Utility/Instrumentation.h
index 13ffc0bd39d0b6..4a9ac810eb05e9 100644
--- a/lldb/include/lldb/Utility/Instrumentation.h
+++ b/lldb/include/lldb/Utility/Instrumentation.h
@@ -21,14 +21,12 @@
 namespace lldb_private {
 namespace instrumentation {
 
-template <typename T,
-          typename std::enable_if<std::is_fundamental<T>::value, int>::type = 0>
+template <typename T, std::enable_if_t<std::is_fundamental<T>::value, int> = 0>
 inline void stringify_append(llvm::raw_string_ostream &ss, const T &t) {
   ss << t;
 }
 
-template <typename T, typename std::enable_if<!std::is_fundamental<T>::value,
-                                              int>::type = 0>
+template <typename T, std::enable_if_t<!std::is_fundamental<T>::value, int> = 0>
 inline void stringify_append(llvm::raw_string_ostream &ss, const T &t) {
   ss << &t;
 }
diff --git a/lldb/include/lldb/Utility/XcodeSDK.h b/lldb/include/lldb/Utility/XcodeSDK.h
index 878b131a181453..f8528995d549c9 100644
--- a/lldb/include/lldb/Utility/XcodeSDK.h
+++ b/lldb/include/lldb/Utility/XcodeSDK.h
@@ -69,7 +69,7 @@ class XcodeSDK {
 
   XcodeSDK &operator=(const XcodeSDK &other);
   XcodeSDK(const XcodeSDK&) = default;
-  bool operator==(const XcodeSDK &other);
+  bool operator==(const XcodeSDK &other) const;
 
   /// Return parsed SDK type and version number.
   Info Parse() const;
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
index a0104d36df8d90..1784487323ad6b 100644
--- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
+++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
@@ -196,6 +196,9 @@ def respond(self, packet):
             return self.vFile(packet)
         if packet.startswith("vRun;"):
             return self.vRun(packet)
+        if packet.startswith("qLaunchGDBServer;"):
+            _, host = packet.partition(";")[2].split(":")
+            return self.qLaunchGDBServer(host)
         if packet.startswith("qLaunchSuccess"):
             return self.qLaunchSuccess()
         if packet.startswith("QEnvironment:"):
@@ -329,6 +332,9 @@ def vFile(self, packet):
     def vRun(self, packet):
         return ""
 
+    def qLaunchGDBServer(self, host):
+        raise self.UnexpectedPacketException()
+
     def qLaunchSuccess(self):
         return ""
 
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index 7cfa3aaafdae18..a574a461d4920a 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -177,11 +177,18 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
     # from working on some systems but limits the liblldb size.
     MESSAGE("-- Symbols (liblldb): exporting all symbols from the lldb namespace")
     add_llvm_symbol_exports(liblldb ${CMAKE_CURRENT_SOURCE_DIR}/liblldb.exports)
-  else()
-    # Don't use an explicit export.  Instead, tell the linker to
-    # export all symbols.
+  elseif (NOT LLDB_EXPORT_ALL_SYMBOLS_EXPORTS_FILE)
+    # Don't use an explicit export. Instead, tell the linker to export all symbols.
     MESSAGE("-- Symbols (liblldb): exporting all symbols from the lldb and lldb_private namespaces")
+    MESSAGE(WARNING "Private LLDB symbols frequently change and no API stability is guaranteed. "
+                    "Only the SB API is guaranteed to be stable.")
     add_llvm_symbol_exports(liblldb ${CMAKE_CURRENT_SOURCE_DIR}/liblldb-private.exports)
+  else ()
+    MESSAGE("-- Symbols (liblldb): exporting all symbols specified in the exports "
+            " file '${LLDB_EXPORT_ALL_SYMBOLS_EXPORTS_FILE}'")
+    MESSAGE(WARNING "Private LLDB symbols frequently change and no API stability is guaranteed. "
+                    "Only the SB API is guaranteed to be stable.")
+    add_llvm_symbol_exports(liblldb "${LLDB_EXPORT_ALL_SYMBOLS_EXPORTS_FILE}")
   endif()
   set_target_properties(liblldb_exports PROPERTIES FOLDER "lldb misc")
 elseif (LLDB_EXPORT_ALL_SYMBOLS)
diff --git a/lldb/source/API/SBPlatform.cpp b/lldb/source/API/SBPlatform.cpp
index f8300a5bab30e4..c31848fe04ea72 100644
--- a/lldb/source/API/SBPlatform.cpp
+++ b/lldb/source/API/SBPlatform.cpp
@@ -7,12 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/API/SBPlatform.h"
+#include "lldb/API/SBDebugger.h"
 #include "lldb/API/SBEnvironment.h"
 #include "lldb/API/SBError.h"
 #include "lldb/API/SBFileSpec.h"
 #include "lldb/API/SBLaunchInfo.h"
 #include "lldb/API/SBModuleSpec.h"
 #include "lldb/API/SBPlatform.h"
+#include "lldb/API/SBTarget.h"
 #include "lldb/API/SBUnixSignals.h"
 #include "lldb/Host/File.h"
 #include "lldb/Target/Platform.h"
@@ -574,6 +576,29 @@ SBError SBPlatform::Launch(SBLaunchInfo &launch_info) {
   });
 }
 
+SBProcess SBPlatform::Attach(SBAttachInfo &attach_info,
+                             const SBDebugger &debugger, SBTarget &target,
+                             SBError &error) {
+  LLDB_INSTRUMENT_VA(this, attach_info, debugger, target, error);
+
+  if (PlatformSP platform_sp = GetSP()) {
+    if (platform_sp->IsConnected()) {
+      ProcessAttachInfo &info = attach_info.ref();
+      Status status;
+      ProcessSP process_sp = platform_sp->Attach(info, debugger.ref(),
+                                                 target.GetSP().get(), status);
+      error.SetError(status);
+      return SBProcess(process_sp);
+    }
+
+    error.SetErrorString("not connected");
+    return {};
+  }
+
+  error.SetErrorString("invalid platform");
+  return {};
+}
+
 SBError SBPlatform::Kill(const lldb::pid_t pid) {
   LLDB_INSTRUMENT_VA(this, pid);
   return ExecuteConnected([&](const lldb::PlatformSP &platform_sp) {
diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td
index 04830b8b990efa..cfdeaabfa00774 100644
--- a/lldb/source/Commands/Options.td
+++ b/lldb/source/Commands/Options.td
@@ -390,7 +390,7 @@ let Command = "expression" in {
     Arg<"Boolean">,
     Desc<"Persist expression result in a variable for subsequent use. "
     "Expression results will be labeled with $-prefixed variables, e.g. $0, "
-    "$1, etc. Defaults to true.">;
+    "$1, etc.">;
 }
 
 let Command = "frame diag" in {
diff --git a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
index f357fea02efbe6..30607159acdc08 100644
--- a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
+++ b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(Darwin-Kernel)
+add_subdirectory(FreeBSD-Kernel)
 add_subdirectory(MacOSX-DYLD)
 add_subdirectory(POSIX-DYLD)
 add_subdirectory(Static)
diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
index 378b2472278605..5aeaf3ae24d7c7 100644
--- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
+++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
@@ -606,8 +606,8 @@ void DynamicLoaderDarwinKernel::KextImageInfo::SetProcessStopId(
   m_load_process_stop_id = stop_id;
 }
 
-bool DynamicLoaderDarwinKernel::KextImageInfo::
-operator==(const KextImageInfo &rhs) {
+bool DynamicLoaderDarwinKernel::KextImageInfo::operator==(
+    const KextImageInfo &rhs) const {
   if (m_uuid.IsValid() || rhs.GetUUID().IsValid()) {
     return m_uuid == rhs.GetUUID();
   }
diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h
index 38a60d154820a9..000c382b2c0111 100644
--- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h
+++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.h
@@ -176,7 +176,7 @@ class DynamicLoaderDarwinKernel : public lldb_private::DynamicLoader {
 
     void SetProcessStopId(uint32_t stop_id);
 
-    bool operator==(const KextImageInfo &rhs);
+    bool operator==(const KextImageInfo &rhs) const;
 
     uint32_t GetAddressByteSize(); // as determined by Mach-O header
 
diff --git a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/CMakeLists.txt
new file mode 100644
index 00000000000000..76daf0a327cf97
--- /dev/null
+++ b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_lldb_library(lldbPluginDynamicLoaderFreeBSDKernel PLUGIN
+  DynamicLoaderFreeBSDKernel.cpp
+
+  LINK_LIBS
+    lldbBreakpoint
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbSymbol
+    lldbTarget
+    lldbUtility
+    lldbPluginObjectFileELF
+  )
diff --git a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp
new file mode 100644
index 00000000000000..a1bf8efb064b61
--- /dev/null
+++ b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp
@@ -0,0 +1,789 @@
+//===-- DynamicLoaderFreeBSDKernel.cpp
+//------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Breakpoint/StoppointCallbackContext.h"
+#include "lldb/Core/Debugger.h"
+#include "lldb/Core/Module.h"
+#include "lldb/Core/ModuleSpec.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Core/Section.h"
+#include "lldb/Host/StreamFile.h"
+#include "lldb/Interpreter/OptionValueProperties.h"
+#include "lldb/Symbol/LocateSymbolFile.h"
+#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Target/OperatingSystem.h"
+#include "lldb/Target/RegisterContext.h"
+#include "lldb/Target/StackFrame.h"
+#include "lldb/Target/Target.h"
+#include "lldb/Target/Thread.h"
+#include "lldb/Target/ThreadPlanRunToAddress.h"
+#include "lldb/Utility/DataBuffer.h"
+#include "lldb/Utility/DataBufferHeap.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/State.h"
+
+#include "Plugins/ObjectFile/ELF/ObjectFileELF.h"
+
+#include "DynamicLoaderFreeBSDKernel.h"
+#include <memory>
+#include <mutex>
+
+using namespace lldb;
+using namespace lldb_private;
+
+LLDB_PLUGIN_DEFINE(DynamicLoaderFreeBSDKernel)
+
+void DynamicLoaderFreeBSDKernel::Initialize() {
+  PluginManager::RegisterPlugin(GetPluginNameStatic(),
+                                GetPluginDescriptionStatic(), CreateInstance,
+                                DebuggerInit);
+}
+
+void DynamicLoaderFreeBSDKernel::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
+llvm::StringRef DynamicLoaderFreeBSDKernel::GetPluginDescriptionStatic() {
+  return "The Dynamic Loader Plugin For FreeBSD Kernel";
+}
+
+static bool is_kernel(Module *module) {
+  if (!module)
+    return false;
+
+  ObjectFile *objfile = module->GetObjectFile();
+  if (!objfile)
+    return false;
+  if (objfile->GetType() != ObjectFile::eTypeExecutable)
+    return false;
+  if (objfile->GetStrata() != ObjectFile::eStrataUnknown &&
+      objfile->GetStrata() != ObjectFile::eStrataKernel)
+    return false;
+
+  return true;
+}
+
+static bool is_kmod(Module *module) {
+  if (!module)
+    return false;
+  if (!module->GetObjectFile())
+    return false;
+  ObjectFile *objfile = module->GetObjectFile();
+  if (objfile->GetType() != ObjectFile::eTypeObjectFile &&
+      objfile->GetType() != ObjectFile::eTypeSharedLibrary)
+    return false;
+
+  return true;
+}
+
+static bool is_reloc(Module *module) {
+  if (!module)
+    return false;
+  if (!module->GetObjectFile())
+    return false;
+  ObjectFile *objfile = module->GetObjectFile();
+  if (objfile->GetType() != ObjectFile::eTypeObjectFile)
+    return false;
+
+  return true;
+}
+
+// Instantiate Function of the FreeBSD Kernel Dynamic Loader Plugin called when
+// Register the Plugin
+DynamicLoader *
+DynamicLoaderFreeBSDKernel::CreateInstance(lldb_private::Process *process,
+                                           bool force) {
+  // Check the environment when the plugin is not force loaded
+  Module *exec = process->GetTarget().GetExecutableModulePointer();
+  if (exec && !is_kernel(exec)) {
+    return nullptr;
+  }
+  if (!force) {
+    // Check if the target is kernel
+    const llvm::Triple &triple_ref =
+        process->GetTarget().GetArchitecture().GetTriple();
+    if (!triple_ref.isOSFreeBSD()) {
+      return nullptr;
+    }
+  }
+
+  // At this point we have checked the target is a FreeBSD kernel and all we
+  // have to do is to find the kernel address
+  const addr_t kernel_address = FindFreeBSDKernel(process);
+
+  if (CheckForKernelImageAtAddress(process, kernel_address).IsValid())
+    return new DynamicLoaderFreeBSDKernel(process, kernel_address);
+
+  return nullptr;
+}
+
+addr_t
+DynamicLoaderFreeBSDKernel::FindFreeBSDKernel(lldb_private::Process *process) {
+  addr_t kernel_addr = process->GetImageInfoAddress();
+  if (kernel_addr == LLDB_INVALID_ADDRESS)
+    kernel_addr = FindKernelAtLoadAddress(process);
+  return kernel_addr;
+}
+
+// Get the kernel address if the kernel is not loaded with a slide
+addr_t DynamicLoaderFreeBSDKernel::FindKernelAtLoadAddress(
+    lldb_private::Process *process) {
+  Module *exe_module = process->GetTarget().GetExecutableModulePointer();
+
+  if (!is_kernel(exe_module))
+    return LLDB_INVALID_ADDRESS;
+
+  ObjectFile *exe_objfile = exe_module->GetObjectFile();
+
+  if (!exe_objfile->GetBaseAddress().IsValid())
+    return LLDB_INVALID_ADDRESS;
+
+  if (CheckForKernelImageAtAddress(
+          process, exe_objfile->GetBaseAddress().GetFileAddress())
+          .IsValid())
+    return exe_objfile->GetBaseAddress().GetFileAddress();
+
+  return LLDB_INVALID_ADDRESS;
+}
+
+// Read ELF header from memry and return
+bool DynamicLoaderFreeBSDKernel::ReadELFHeader(Process *process,
+                                               lldb::addr_t addr,
+                                               llvm::ELF::Elf32_Ehdr &header,
+                                               bool *read_error) {
+  Status error;
+  if (read_error)
+    *read_error = false;
+
+  if (process->ReadMemory(addr, &header, sizeof(header), error) !=
+      sizeof(header)) {
+    if (read_error)
+      *read_error = true;
+    return false;
+  }
+
+  if (!header.checkMagic())
+    return false;
+
+  return true;
+}
+
+// Check the correctness of Kernel and return UUID
+lldb_private::UUID DynamicLoaderFreeBSDKernel::CheckForKernelImageAtAddress(
+    Process *process, lldb::addr_t addr, bool *read_error) {
+  Log *log = GetLog(LLDBLog::DynamicLoader);
+
+  if (addr == LLDB_INVALID_ADDRESS) {
+    if (read_error)
+      *read_error = true;
+    return UUID();
+  }
+
+  LLDB_LOGF(log,
+            "DynamicLoaderFreeBSDKernel::CheckForKernelImageAtAddress: "
+            "looking for kernel binary at 0x%" PRIx64,
+            addr);
+
+  llvm::ELF::Elf32_Ehdr header;
+  if (!ReadELFHeader(process, addr, header)) {
+    *read_error = true;
+    return UUID();
+  }
+
+  // Check header type
+  if (header.e_type != llvm::ELF::ET_EXEC)
+    return UUID();
+
+  ModuleSP memory_module_sp =
+      process->ReadModuleFromMemory(FileSpec("temp_freebsd_kernel"), addr);
+
+  if (!memory_module_sp.get()) {
+    *read_error = true;
+    return UUID();
+  }
+
+  ObjectFile *exe_objfile = memory_module_sp->GetObjectFile();
+  if (exe_objfile == nullptr) {
+    LLDB_LOGF(log,
+              "DynamicLoaderFreeBSDKernel::CheckForKernelImageAtAddress "
+              "found a binary at 0x%" PRIx64
+              " but could not create an object file from memory",
+              addr);
+    return UUID();
+  }
+
+  // In here, I should check is_kernel for memory_module_sp
+  // However, the ReadModuleFromMemory reads wrong section so that this check
+  // will failed
+  ArchSpec kernel_arch(llvm::ELF::convertEMachineToArchName(header.e_machine));
+
+  if (!process->GetTarget().GetArchitecture().IsCompatibleMatch(kernel_arch))
+    process->GetTarget().SetArchitecture(kernel_arch);
+
+  std::string uuid_str;
+  if (memory_module_sp->GetUUID().IsValid()) {
+    uuid_str = "with UUID ";
+    uuid_str += memory_module_sp->GetUUID().GetAsString();
+  } else {
+    uuid_str = "and no LC_UUID found in load commands ";
+  }
+  LLDB_LOGF(log,
+            "DynamicLoaderFreeBSDKernel::CheckForKernelImageAtAddress: "
+            "kernel binary image found at 0x%" PRIx64 " with arch '%s' %s",
+            addr, kernel_arch.GetTriple().str().c_str(), uuid_str.c_str());
+
+  return memory_module_sp->GetUUID();
+}
+
+void DynamicLoaderFreeBSDKernel::DebuggerInit(
+    lldb_private::Debugger &debugger) {}
+
+DynamicLoaderFreeBSDKernel::DynamicLoaderFreeBSDKernel(Process *process,
+                                                       addr_t kernel_address)
+    : DynamicLoader(process), m_process(process),
+      m_linker_file_list_struct_addr(LLDB_INVALID_ADDRESS),
+      m_linker_file_head_addr(LLDB_INVALID_ADDRESS),
+      m_kernel_load_address(kernel_address), m_mutex() {
+  process->SetCanRunCode(false);
+}
+
+DynamicLoaderFreeBSDKernel::~DynamicLoaderFreeBSDKernel() { Clear(true); }
+
+void DynamicLoaderFreeBSDKernel::Update() {
+  LoadKernelModules();
+  SetNotificationBreakPoint();
+}
+
+// Create in memory Module at the load address
+bool DynamicLoaderFreeBSDKernel::KModImageInfo::ReadMemoryModule(
+    lldb_private::Process *process) {
+  Log *log = GetLog(LLDBLog::DynamicLoader);
+  if (m_memory_module_sp)
+    return true;
+  if (m_load_address == LLDB_INVALID_ADDRESS)
+    return false;
+
+  FileSpec file_spec(m_name);
+
+  ModuleSP memory_module_sp;
+
+  llvm::ELF::Elf32_Ehdr elf_eheader;
+  size_t size_to_read = 512;
+
+  if (ReadELFHeader(process, m_load_address, elf_eheader)) {
+    if (elf_eheader.e_ident[llvm::ELF::EI_CLASS] == llvm::ELF::ELFCLASS32) {
+      size_to_read = sizeof(llvm::ELF::Elf32_Ehdr) +
+                     elf_eheader.e_phnum * elf_eheader.e_phentsize;
+    } else if (elf_eheader.e_ident[llvm::ELF::EI_CLASS] ==
+               llvm::ELF::ELFCLASS64) {
+      llvm::ELF::Elf64_Ehdr elf_eheader;
+      Status error;
+      if (process->ReadMemory(m_load_address, &elf_eheader, sizeof(elf_eheader),
+                              error) == sizeof(elf_eheader))
+        size_to_read = sizeof(llvm::ELF::Elf64_Ehdr) +
+                       elf_eheader.e_phnum * elf_eheader.e_phentsize;
+    }
+  }
+
+  memory_module_sp =
+      process->ReadModuleFromMemory(file_spec, m_load_address, size_to_read);
+
+  if (!memory_module_sp)
+    return false;
+
+  bool this_is_kernel = is_kernel(memory_module_sp.get());
+
+  if (!m_uuid.IsValid() && memory_module_sp->GetUUID().IsValid())
+    m_uuid = memory_module_sp->GetUUID();
+
+  m_memory_module_sp = memory_module_sp;
+  m_is_kernel = this_is_kernel;
+
+  // The kernel binary is from memory
+  if (this_is_kernel) {
+    LLDB_LOGF(log, "KextImageInfo::ReadMemoryModule read the kernel binary out "
+                   "of memory");
+
+    if (memory_module_sp->GetArchitecture().IsValid())
+      process->GetTarget().SetArchitecture(memory_module_sp->GetArchitecture());
+  }
+
+  return true;
+}
+
+bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule(
+    lldb_private::Process *process) {
+  Log *log = GetLog(LLDBLog::DynamicLoader);
+
+  if (IsLoaded())
+    return true;
+
+  Target &target = process->GetTarget();
+
+  if (IsKernel() && m_uuid.IsValid()) {
+    Stream &s = target.GetDebugger().GetOutputStream();
+    s.Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str());
+    s.Printf("Load Address: 0x%" PRIx64 "\n", m_load_address);
+  }
+
+  // Test if the module is loaded into the taget,
+  // maybe the module is loaded manually by user by doing target module add
+  // So that we have to create the module manually
+  if (!m_module_sp) {
+    const ModuleList &target_images = target.GetImages();
+    m_module_sp = target_images.FindModule(m_uuid);
+
+    // Search in the file system
+    if (!m_module_sp) {
+      ModuleSpec module_spec(FileSpec(GetPath()), target.GetArchitecture());
+      if (IsKernel()) {
+        Status error;
+        if (Symbols::DownloadObjectAndSymbolFile(module_spec, error, true)) {
+          if (FileSystem::Instance().Exists(module_spec.GetFileSpec()))
+            m_module_sp = std::make_shared<Module>(module_spec.GetFileSpec(),
+                                                   target.GetArchitecture());
+        }
+      }
+
+      if (!m_module_sp)
+        m_module_sp = target.GetOrCreateModule(module_spec, true);
+      if (IsKernel() && !m_module_sp) {
+        Stream &s = target.GetDebugger().GetOutputStream();
+        s.Printf("WARNING: Unable to locate kernel binary on the debugger "
+                 "system.\n");
+      }
+    }
+
+    if (m_module_sp) {
+      // If the file is not kernel or kmod, the target should be loaded once and
+      // don't reload again
+      if (!IsKernel() && !is_kmod(m_module_sp.get())) {
+        ModuleSP existing_module_sp = target.GetImages().FindModule(m_uuid);
+        if (existing_module_sp &&
+            existing_module_sp->IsLoadedInTarget(&target)) {
+          LLDB_LOGF(log,
+                    "'%s' with UUID %s is not a kmod or kernel, and is "
+                    "already registered in target, not loading.",
+                    m_name.c_str(), m_uuid.GetAsString().c_str());
+          return true;
+        }
+      }
+      m_uuid = m_module_sp->GetUUID();
+
+      // or append to the images
+      target.GetImages().AppendIfNeeded(m_module_sp, false);
+    }
+  }
+
+  // If this file is relocatable kernel module(x86_64), adjust it's
+  // section(PT_LOAD segment) and return Because the kernel module's load
+  // address is the text section. lldb cannot create full memory module upon
+  // relocatable file So what we do is to set the load address only.
+  if (is_kmod(m_module_sp.get()) && is_reloc(m_module_sp.get())) {
+    m_stop_id = process->GetStopID();
+    bool changed = false;
+    m_module_sp->SetLoadAddress(target, m_load_address, true, changed);
+    return true;
+  }
+
+  if (m_module_sp)
+    ReadMemoryModule(process);
+
+  // Calculate the slides of in memory module
+  if (!m_memory_module_sp || !m_module_sp) {
+    m_module_sp.reset();
+    return false;
+  }
+
+  ObjectFile *ondisk_object_file = m_module_sp->GetObjectFile();
+  ObjectFile *memory_object_file = m_memory_module_sp->GetObjectFile();
+
+  if (!ondisk_object_file || !memory_object_file)
+    m_module_sp.reset();
+
+  // Find the slide address
+  addr_t fixed_slide = LLDB_INVALID_ADDRESS;
+  if (ObjectFileELF *memory_objfile_elf =
+          llvm::dyn_cast<ObjectFileELF>(memory_object_file)) {
+    addr_t load_address = memory_object_file->GetBaseAddress().GetFileAddress();
+
+    if (load_address != LLDB_INVALID_ADDRESS &&
+        m_load_address != load_address) {
+      fixed_slide = m_load_address - load_address;
+      LLDB_LOGF(log,
+                "kmod %s in-memory LOAD vmaddr is not correct, using a "
+                "fixed slide of 0x%" PRIx64,
+                m_name.c_str(), fixed_slide);
+    }
+  }
+
+  SectionList *ondisk_section_list = ondisk_object_file->GetSectionList();
+  SectionList *memory_section_list = memory_object_file->GetSectionList();
+
+  if (memory_section_list && ondisk_object_file) {
+    const uint32_t num_ondisk_sections = ondisk_section_list->GetSize();
+    uint32_t num_load_sections = 0;
+
+    for (uint32_t section_idx = 0; section_idx < num_ondisk_sections;
+         ++section_idx) {
+      SectionSP on_disk_section_sp =
+          ondisk_section_list->GetSectionAtIndex(section_idx);
+
+      if (!on_disk_section_sp)
+        continue;
+      if (fixed_slide != LLDB_INVALID_ADDRESS) {
+        target.SetSectionLoadAddress(on_disk_section_sp,
+                                     on_disk_section_sp->GetFileAddress() +
+                                         fixed_slide);
+
+      } else {
+        const Section *memory_section =
+            memory_section_list
+                ->FindSectionByName(on_disk_section_sp->GetName())
+                .get();
+        if (memory_section) {
+          target.SetSectionLoadAddress(on_disk_section_sp,
+                                       memory_section->GetFileAddress());
+          ++num_load_sections;
+        }
+      }
+    }
+
+    if (num_load_sections)
+      m_stop_id = process->GetStopID();
+    else
+      m_module_sp.reset();
+  } else {
+    m_module_sp.reset();
+  }
+
+  if (IsLoaded() && m_module_sp && IsKernel()) {
+    Stream &s = target.GetDebugger().GetOutputStream();
+    ObjectFile *kernel_object_file = m_module_sp->GetObjectFile();
+    if (kernel_object_file) {
+      addr_t file_address =
+          kernel_object_file->GetBaseAddress().GetFileAddress();
+      if (m_load_address != LLDB_INVALID_ADDRESS &&
+          file_address != LLDB_INVALID_ADDRESS) {
+        s.Printf("Kernel slide 0x%" PRIx64 " in memory.\n",
+                 m_load_address - file_address);
+        s.Printf("Loaded kernel file %s\n",
+                 m_module_sp->GetFileSpec().GetPath().c_str());
+      }
+    }
+    s.Flush();
+  }
+
+  return IsLoaded();
+}
+
+// This function is work for kernel file, others it wil reset load address and
+// return false
+bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingFileAddress(
+    lldb_private::Process *process) {
+  if (IsLoaded())
+    return true;
+
+  if (m_module_sp) {
+    bool changed = false;
+    if (m_module_sp->SetLoadAddress(process->GetTarget(), 0, true, changed))
+      m_stop_id = process->GetStopID();
+  }
+
+  return false;
+}
+
+// Get the head of found_list
+bool DynamicLoaderFreeBSDKernel::ReadKmodsListHeader() {
+  std::lock_guard<decltype(m_mutex)> guard(m_mutex);
+
+  if (m_linker_file_list_struct_addr.IsValid()) {
+    // Get tqh_first struct element from linker_files
+    Status error;
+    addr_t address = m_process->ReadPointerFromMemory(
+        m_linker_file_list_struct_addr.GetLoadAddress(&m_process->GetTarget()),
+        error);
+    if (address != LLDB_INVALID_ADDRESS && error.Success()) {
+      m_linker_file_head_addr = Address(address);
+    } else {
+      m_linker_file_list_struct_addr.Clear();
+      return false;
+    }
+
+    if (!m_linker_file_head_addr.IsValid() ||
+        m_linker_file_head_addr.GetFileAddress() == 0) {
+      m_linker_file_list_struct_addr.Clear();
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parse Kmod info in found_list
+bool DynamicLoaderFreeBSDKernel::ParseKmods(Address linker_files_head_addr) {
+  std::lock_guard<decltype(m_mutex)> guard(m_mutex);
+  KModImageInfo::collection_type linker_files_list;
+  Log *log = GetLog(LLDBLog::DynamicLoader);
+
+  if (!ReadAllKmods(linker_files_head_addr, linker_files_list))
+    return false;
+  LLDB_LOGF(
+      log,
+      "Kmod-changed breakpoint hit, there are %zu kernel modules currently.\n",
+      linker_files_list.size());
+
+  ModuleList &modules = m_process->GetTarget().GetImages();
+  ModuleList remove_modules;
+  ModuleList add_modules;
+
+  for (ModuleSP module : modules.Modules()) {
+    if (is_kernel(module.get()))
+      continue;
+    if (is_kmod(module.get()))
+      remove_modules.AppendIfNeeded(module);
+  }
+
+  m_process->GetTarget().ModulesDidUnload(remove_modules, false);
+
+  for (KModImageInfo &image_info : linker_files_list) {
+    if (m_kld_name_to_uuid.find(image_info.GetName()) !=
+        m_kld_name_to_uuid.end())
+      image_info.SetUUID(m_kld_name_to_uuid[image_info.GetName()]);
+    bool failed_to_load = false;
+    if (!image_info.LoadImageUsingMemoryModule(m_process)) {
+      image_info.LoadImageUsingFileAddress(m_process);
+      failed_to_load = true;
+    } else {
+      m_linker_files_list.push_back(image_info);
+      m_kld_name_to_uuid[image_info.GetName()] = image_info.GetUUID();
+    }
+
+    if (!failed_to_load)
+      add_modules.AppendIfNeeded(image_info.GetModule());
+  }
+  m_process->GetTarget().ModulesDidLoad(add_modules);
+  return true;
+}
+
+// Read all kmod from a given arrays of list
+bool DynamicLoaderFreeBSDKernel::ReadAllKmods(
+    Address linker_files_head_addr,
+    KModImageInfo::collection_type &kmods_list) {
+
+  // Get offset of next member and load address symbol
+  static ConstString kld_off_address_symbol_name("kld_off_address");
+  static ConstString kld_off_next_symbol_name("kld_off_next");
+  static ConstString kld_off_filename_symbol_name("kld_off_filename");
+  static ConstString kld_off_pathname_symbol_name("kld_off_pathname");
+  const Symbol *kld_off_address_symbol =
+      m_kernel_image_info.GetModule()->FindFirstSymbolWithNameAndType(
+          kld_off_address_symbol_name, eSymbolTypeData);
+  const Symbol *kld_off_next_symbol =
+      m_kernel_image_info.GetModule()->FindFirstSymbolWithNameAndType(
+          kld_off_next_symbol_name, eSymbolTypeData);
+  const Symbol *kld_off_filename_symbol =
+      m_kernel_image_info.GetModule()->FindFirstSymbolWithNameAndType(
+          kld_off_filename_symbol_name, eSymbolTypeData);
+  const Symbol *kld_off_pathname_symbol =
+      m_kernel_image_info.GetModule()->FindFirstSymbolWithNameAndType(
+          kld_off_pathname_symbol_name, eSymbolTypeData);
+
+  if (!kld_off_address_symbol || !kld_off_next_symbol ||
+      !kld_off_filename_symbol || !kld_off_pathname_symbol)
+    return false;
+
+  Status error;
+  const int32_t kld_off_address = m_process->ReadSignedIntegerFromMemory(
+      kld_off_address_symbol->GetAddress().GetLoadAddress(
+          &m_process->GetTarget()),
+      4, 0, error);
+  if (error.Fail())
+    return false;
+  const int32_t kld_off_next = m_process->ReadSignedIntegerFromMemory(
+      kld_off_next_symbol->GetAddress().GetLoadAddress(&m_process->GetTarget()),
+      4, 0, error);
+  if (error.Fail())
+    return false;
+  const int32_t kld_off_filename = m_process->ReadSignedIntegerFromMemory(
+      kld_off_filename_symbol->GetAddress().GetLoadAddress(
+          &m_process->GetTarget()),
+      4, 0, error);
+  if (error.Fail())
+    return false;
+
+  const int32_t kld_off_pathname = m_process->ReadSignedIntegerFromMemory(
+      kld_off_pathname_symbol->GetAddress().GetLoadAddress(
+          &m_process->GetTarget()),
+      4, 0, error);
+  if (error.Fail())
+    return false;
+
+  // Parse KMods
+  addr_t kld_load_addr(LLDB_INVALID_ADDRESS);
+  char kld_filename[255];
+  char kld_pathname[255];
+  addr_t current_kld =
+      linker_files_head_addr.GetLoadAddress(&m_process->GetTarget());
+
+  while (current_kld != 0) {
+    addr_t kld_filename_addr =
+        m_process->ReadPointerFromMemory(current_kld + kld_off_filename, error);
+    if (error.Fail())
+      return false;
+    addr_t kld_pathname_addr =
+        m_process->ReadPointerFromMemory(current_kld + kld_off_pathname, error);
+    if (error.Fail())
+      return false;
+
+    m_process->ReadCStringFromMemory(kld_filename_addr, kld_filename,
+                                     sizeof(kld_filename), error);
+    if (error.Fail())
+      return false;
+    m_process->ReadCStringFromMemory(kld_pathname_addr, kld_pathname,
+                                     sizeof(kld_pathname), error);
+    if (error.Fail())
+      return false;
+    kld_load_addr =
+        m_process->ReadPointerFromMemory(current_kld + kld_off_address, error);
+    if (error.Fail())
+      return false;
+
+    kmods_list.emplace_back();
+    KModImageInfo &kmod_info = kmods_list.back();
+    kmod_info.SetName(kld_filename);
+    kmod_info.SetLoadAddress(kld_load_addr);
+    kmod_info.SetPath(kld_pathname);
+
+    current_kld =
+        m_process->ReadPointerFromMemory(current_kld + kld_off_next, error);
+    if (kmod_info.GetName() == "kernel")
+      kmods_list.pop_back();
+    if (error.Fail())
+      return false;
+  }
+
+  return true;
+}
+
+// Read all kmods
+void DynamicLoaderFreeBSDKernel::ReadAllKmods() {
+  std::lock_guard<decltype(m_mutex)> guard(m_mutex);
+
+  if (ReadKmodsListHeader()) {
+    if (m_linker_file_head_addr.IsValid()) {
+      if (!ParseKmods(m_linker_file_head_addr))
+        m_linker_files_list.clear();
+    }
+  }
+}
+
+// Load all Kernel Modules
+void DynamicLoaderFreeBSDKernel::LoadKernelModules() {
+  Log *log = GetLog(LLDBLog::DynamicLoader);
+  LLDB_LOGF(log, "DynamicLoaderFreeBSDKernel::LoadKernelModules "
+                 "Start loading Kernel Module");
+
+  // Initialize Kernel Image Information at the first time
+  if (m_kernel_image_info.GetLoadAddress() == LLDB_INVALID_ADDRESS) {
+    ModuleSP module_sp = m_process->GetTarget().GetExecutableModule();
+    if (is_kernel(module_sp.get())) {
+      m_kernel_image_info.SetModule(module_sp);
+      m_kernel_image_info.SetIsKernel(true);
+    }
+
+    // Set name for kernel
+    llvm::StringRef kernel_name("freebsd_kernel");
+    module_sp = m_kernel_image_info.GetModule();
+    if (module_sp.get() && module_sp->GetObjectFile() &&
+        !module_sp->GetObjectFile()->GetFileSpec().GetFilename().IsEmpty())
+      kernel_name = module_sp->GetObjectFile()
+                        ->GetFileSpec()
+                        .GetFilename()
+                        .GetStringRef();
+    m_kernel_image_info.SetName(kernel_name.data());
+
+    if (m_kernel_image_info.GetLoadAddress() == LLDB_INVALID_ADDRESS) {
+      m_kernel_image_info.SetLoadAddress(m_kernel_load_address);
+    }
+
+    // Build In memory Module
+    if (m_kernel_image_info.GetLoadAddress() != LLDB_INVALID_ADDRESS) {
+      // If the kernel is not loaded in the memory, use file to load
+      if (!m_kernel_image_info.LoadImageUsingMemoryModule(m_process))
+        m_kernel_image_info.LoadImageUsingFileAddress(m_process);
+    }
+  }
+
+  LoadOperatingSystemPlugin(false);
+
+  if (!m_kernel_image_info.IsLoaded() || !m_kernel_image_info.GetModule()) {
+    m_kernel_image_info.Clear();
+    return;
+  }
+
+  static ConstString modlist_symbol_name("linker_files");
+
+  const Symbol *symbol =
+      m_kernel_image_info.GetModule()->FindFirstSymbolWithNameAndType(
+          modlist_symbol_name, lldb::eSymbolTypeData);
+
+  if (symbol) {
+    m_linker_file_list_struct_addr = symbol->GetAddress();
+    ReadAllKmods();
+  } else {
+    LLDB_LOGF(log, "DynamicLoaderFreeBSDKernel::LoadKernelModules "
+                   "cannot file modlist symbol");
+  }
+}
+
+// Update symbol when use kldload by setting callback function on kldload
+void DynamicLoaderFreeBSDKernel::SetNotificationBreakPoint() {}
+
+// Hook called when attach to a process
+void DynamicLoaderFreeBSDKernel::DidAttach() {
+  PrivateInitialize(m_process);
+  Update();
+}
+
+// Hook called after attach to a process
+void DynamicLoaderFreeBSDKernel::DidLaunch() {
+  PrivateInitialize(m_process);
+  Update();
+}
+
+// Clear all member except kernel address
+void DynamicLoaderFreeBSDKernel::Clear(bool clear_process) {
+  std::lock_guard<decltype(m_mutex)> guard(m_mutex);
+  if (clear_process)
+    m_process = nullptr;
+  m_linker_file_head_addr.Clear();
+  m_linker_file_list_struct_addr.Clear();
+  m_kernel_image_info.Clear();
+  m_linker_files_list.clear();
+}
+
+// Reinitialize class
+void DynamicLoaderFreeBSDKernel::PrivateInitialize(Process *process) {
+  Clear(true);
+  m_process = process;
+}
+
+ThreadPlanSP DynamicLoaderFreeBSDKernel::GetStepThroughTrampolinePlan(
+    lldb_private::Thread &thread, bool stop_others) {
+  Log *log = GetLog(LLDBLog::Step);
+  LLDB_LOGF(log, "DynamicLoaderFreeBSDKernel::GetStepThroughTrampolinePlan is "
+                 "not yet implemented.");
+  return {};
+}
+
+Status DynamicLoaderFreeBSDKernel::CanLoadImage() {
+  Status error("shared object cannot be loaded into kernel");
+  return error;
+}
diff --git a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.h b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.h
new file mode 100644
index 00000000000000..d8656e9c49dfe2
--- /dev/null
+++ b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.h
@@ -0,0 +1,171 @@
+//===-- DynamicLoaderFreeBSDKernel.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_PLUGINS_DYNAMICLOADER_FREEBSD_KERNEL_DYNAMICLOADERFREEBSDKERNEL_H
+#define LLDB_SOURCE_PLUGINS_DYNAMICLOADER_FREEBSD_KERNEL_DYNAMICLOADERFREEBSDKERNEL_H
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "lldb/Target/DynamicLoader.h"
+#include "lldb/Target/Process.h"
+#include "lldb/Utility/FileSpec.h"
+#include "lldb/Utility/UUID.h"
+#include "llvm/BinaryFormat/ELF.h"
+
+class DynamicLoaderFreeBSDKernel : public lldb_private::DynamicLoader {
+public:
+  DynamicLoaderFreeBSDKernel(lldb_private::Process *process,
+                             lldb::addr_t kernel_addr);
+
+  ~DynamicLoaderFreeBSDKernel() override;
+
+  // Static Functions
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() { return "freebsd-kernel"; }
+
+  static llvm::StringRef GetPluginDescriptionStatic();
+
+  static lldb_private::DynamicLoader *
+  CreateInstance(lldb_private::Process *process, bool force);
+
+  static void DebuggerInit(lldb_private::Debugger &debugger);
+
+  static lldb::addr_t FindFreeBSDKernel(lldb_private::Process *process);
+
+  // Hooks for time point that after attach to some proccess
+  void DidAttach() override;
+
+  void DidLaunch() override;
+
+  lldb::ThreadPlanSP GetStepThroughTrampolinePlan(lldb_private::Thread &thread,
+                                                  bool stop_others) override;
+
+  lldb_private::Status CanLoadImage() override;
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+
+protected:
+  class KModImageInfo {
+  public:
+    KModImageInfo()
+        : m_module_sp(), m_memory_module_sp(), m_uuid(), m_name(), m_path() {}
+
+    void Clear() {
+      m_load_address = LLDB_INVALID_ADDRESS;
+      m_name.clear();
+      m_uuid.Clear();
+      m_module_sp.reset();
+      m_memory_module_sp.reset();
+      m_stop_id = UINT32_MAX;
+      m_path.clear();
+    }
+
+    void SetLoadAddress(lldb::addr_t load_address) {
+      m_load_address = load_address;
+    }
+
+    lldb::addr_t GetLoadAddress() const { return m_load_address; }
+
+    void SetUUID(const lldb_private::UUID uuid) { m_uuid = uuid; }
+
+    lldb_private::UUID GetUUID() const { return m_uuid; }
+
+    void SetName(const char *name) { m_name = name; }
+
+    std::string GetName() const { return m_name; }
+
+    void SetPath(const char *path) { m_path = path; }
+
+    std::string GetPath() const { return m_path; }
+
+    void SetModule(lldb::ModuleSP module) { m_module_sp = module; }
+
+    lldb::ModuleSP GetModule() { return m_module_sp; }
+
+    void SetIsKernel(bool is_kernel) { m_is_kernel = is_kernel; }
+
+    bool IsKernel() const { return m_is_kernel; };
+
+    void SetStopID(uint32_t stop_id) { m_stop_id = stop_id; }
+
+    uint32_t GetStopID() { return m_stop_id; }
+
+    bool IsLoaded() const { return m_stop_id != UINT32_MAX; };
+
+    bool ReadMemoryModule(lldb_private::Process *process);
+
+    bool LoadImageUsingMemoryModule(lldb_private::Process *process);
+
+    bool LoadImageUsingFileAddress(lldb_private::Process *process);
+
+    using collection_type = std::vector<KModImageInfo>;
+
+  private:
+    lldb::ModuleSP m_module_sp;
+    lldb::ModuleSP m_memory_module_sp;
+    lldb::addr_t m_load_address = LLDB_INVALID_ADDRESS;
+    lldb_private::UUID m_uuid;
+    bool m_is_kernel = false;
+    std::string m_name;
+    std::string m_path;
+    uint32_t m_stop_id = UINT32_MAX;
+  };
+
+  void PrivateInitialize(lldb_private::Process *process);
+
+  void Clear(bool clear_process);
+
+  void Update();
+
+  void LoadKernelModules();
+
+  void ReadAllKmods();
+
+  bool ReadAllKmods(lldb_private::Address linker_files_head_address,
+                    KModImageInfo::collection_type &kmods_list);
+
+  bool ReadKmodsListHeader();
+
+  bool ParseKmods(lldb_private::Address linker_files_head_address);
+
+  void SetNotificationBreakPoint();
+
+  static lldb_private::UUID
+  CheckForKernelImageAtAddress(lldb_private::Process *process,
+                               lldb::addr_t address,
+                               bool *read_error = nullptr);
+
+  static lldb::addr_t FindKernelAtLoadAddress(lldb_private::Process *process);
+
+  static bool ReadELFHeader(lldb_private::Process *process,
+                            lldb::addr_t address, llvm::ELF::Elf32_Ehdr &header,
+                            bool *read_error = nullptr);
+
+  lldb_private::Process *m_process;
+  lldb_private::Address m_linker_file_list_struct_addr;
+  lldb_private::Address m_linker_file_head_addr;
+  lldb::addr_t m_kernel_load_address;
+  KModImageInfo m_kernel_image_info;
+  KModImageInfo::collection_type m_linker_files_list;
+  std::recursive_mutex m_mutex;
+  std::unordered_map<std::string, lldb_private::UUID> m_kld_name_to_uuid;
+
+private:
+  DynamicLoaderFreeBSDKernel(const DynamicLoaderFreeBSDKernel &) = delete;
+
+  const DynamicLoaderFreeBSDKernel &
+  operator=(const DynamicLoaderFreeBSDKernel &) = delete;
+};
+
+#endif
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index c1743a5e0a418d..ad6d627938c052 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -332,14 +332,12 @@ bool CPlusPlusLanguage::MethodName::ContainsPath(llvm::StringRef path) {
   // If we can't parse the incoming name, then just check that it contains path.
   if (m_parse_error)
     return m_full.GetStringRef().contains(path);
-    
+
   llvm::StringRef identifier;
   llvm::StringRef context;
   std::string path_str = path.str();
-  bool success 
-      = CPlusPlusLanguage::ExtractContextAndIdentifier(path_str.c_str(),
-                                                       context,
-                                                       identifier);
+  bool success = CPlusPlusLanguage::ExtractContextAndIdentifier(
+      path_str.c_str(), context, identifier);
   if (!success)
     return m_full.GetStringRef().contains(path);
 
@@ -372,7 +370,7 @@ bool CPlusPlusLanguage::MethodName::ContainsPath(llvm::StringRef path) {
     return false;
   if (haystack.empty() || !isalnum(haystack.back()))
     return true;
-    
+
   return false;
 }
 
@@ -388,7 +386,7 @@ bool CPlusPlusLanguage::IsCPPMangledName(llvm::StringRef name) {
   return true;
 }
 
-bool CPlusPlusLanguage::DemangledNameContainsPath(llvm::StringRef path, 
+bool CPlusPlusLanguage::DemangledNameContainsPath(llvm::StringRef path,
                                                   ConstString demangled) const {
   MethodName demangled_name(demangled);
   return demangled_name.ContainsPath(path);
@@ -1104,6 +1102,11 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
       SyntheticChildrenSP(new ScriptedSyntheticChildren(
           stl_synth_flags,
           "lldb.formatters.cpp.gnu_libstdcpp.StdForwardListSynthProvider")));
+  cpp_category_sp->AddTypeSynthetic(
+      "^std::variant<.+>$", eFormatterMatchRegex,
+      SyntheticChildrenSP(new ScriptedSyntheticChildren(
+          stl_synth_flags,
+          "lldb.formatters.cpp.gnu_libstdcpp.VariantSynthProvider")));
 
   stl_summary_flags.SetDontShowChildren(false);
   stl_summary_flags.SetSkipPointers(false);
@@ -1148,6 +1151,11 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
       TypeSummaryImplSP(new ScriptSummaryFormat(
           stl_summary_flags,
           "lldb.formatters.cpp.gnu_libstdcpp.ForwardListSummaryProvider")));
+  cpp_category_sp->AddTypeSummary(
+      "^std::variant<.+>$", eFormatterMatchRegex,
+      TypeSummaryImplSP(new ScriptSummaryFormat(
+          stl_summary_flags,
+          "lldb.formatters.cpp.gnu_libstdcpp.VariantSummaryProvider")));
 
   AddCXXSynthetic(
       cpp_category_sp,
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 2da971dff895b4..43ab87f08e1925 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -935,6 +935,16 @@ lldb_private::Address ObjectFileELF::GetEntryPointAddress() {
 }
 
 Address ObjectFileELF::GetBaseAddress() {
+  if (GetType() == ObjectFile::eTypeObjectFile) {
+    for (SectionHeaderCollIter I = std::next(m_section_headers.begin());
+         I != m_section_headers.end(); ++I) {
+      const ELFSectionHeaderInfo &header = *I;
+      if (header.sh_flags & SHF_ALLOC)
+        return Address(GetSectionList()->FindSectionByID(SectionIndex(I)), 0);
+    }
+    return LLDB_INVALID_ADDRESS;
+  }
+
   for (const auto &EnumPHdr : llvm::enumerate(ProgramHeaders())) {
     const ELFProgramHeader &H = EnumPHdr.value();
     if (H.p_type != PT_LOAD)
@@ -1764,7 +1774,12 @@ class VMAddressProvider {
   VMRange GetVMRange(const ELFSectionHeader &H) {
     addr_t Address = H.sh_addr;
     addr_t Size = H.sh_flags & SHF_ALLOC ? H.sh_size : 0;
-    if (ObjectType == ObjectFile::Type::eTypeObjectFile && Segments.empty() && (H.sh_flags & SHF_ALLOC)) {
+
+    // When this is a debug file for relocatable file, the address is all zero
+    // and thus needs to use accumulate method
+    if ((ObjectType == ObjectFile::Type::eTypeObjectFile ||
+         (ObjectType == ObjectFile::Type::eTypeDebugInfo && H.sh_addr == 0)) &&
+        Segments.empty() && (H.sh_flags & SHF_ALLOC)) {
       NextVMAddress =
           llvm::alignTo(NextVMAddress, std::max<addr_t>(H.sh_addralign, 1));
       Address = NextVMAddress;
@@ -3454,10 +3469,28 @@ ObjectFile::Strata ObjectFileELF::CalculateStrata() {
 
   case llvm::ELF::ET_EXEC:
     // 2 - Executable file
-    // TODO: is there any way to detect that an executable is a kernel
-    // related executable by inspecting the program headers, section headers,
-    // symbols, or any other flag bits???
-    return eStrataUser;
+    {
+      SectionList *section_list = GetSectionList();
+      if (section_list) {
+        static ConstString loader_section_name(".interp");
+        SectionSP loader_section =
+            section_list->FindSectionByName(loader_section_name);
+        if (loader_section) {
+          char buffer[256];
+          size_t read_size =
+              ReadSectionData(loader_section.get(), 0, buffer, sizeof(buffer));
+
+          // We compare the content of .interp section
+          // It will contains \0 when counting read_size, so the size needs to
+          // decrease by one
+          llvm::StringRef loader_name(buffer, read_size - 1);
+          llvm::StringRef freebsd_kernel_loader_name("/red/herring");
+          if (loader_name.equals(freebsd_kernel_loader_name))
+            return eStrataKernel;
+        }
+      }
+      return eStrataUser;
+    }
 
   case llvm::ELF::ET_DYN:
     // 3 - Shared object file
diff --git a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
index e3707365a9c3f1..601f5df43dbba4 100644
--- a/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
+++ b/lldb/source/Plugins/Process/FreeBSDKernel/ProcessFreeBSDKernel.cpp
@@ -10,7 +10,7 @@
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Target/DynamicLoader.h"
 
-#include "Plugins/DynamicLoader/Static/DynamicLoaderStatic.h"
+#include "Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.h"
 #include "ProcessFreeBSDKernel.h"
 #include "ThreadFreeBSDKernel.h"
 
@@ -262,7 +262,7 @@ Status ProcessFreeBSDKernel::DoLoadCore() {
 DynamicLoader *ProcessFreeBSDKernel::GetDynamicLoader() {
   if (m_dyld_up.get() == nullptr)
     m_dyld_up.reset(DynamicLoader::FindPlugin(
-        this, DynamicLoaderStatic::GetPluginNameStatic()));
+        this, DynamicLoaderFreeBSDKernel::GetPluginNameStatic()));
   return m_dyld_up.get();
 }
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/CMakeLists.txt b/lldb/source/Plugins/SymbolFile/DWARF/CMakeLists.txt
index dad20604006871..0e4fd5b995d1ba 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolFile/DWARF/CMakeLists.txt
@@ -17,7 +17,6 @@ add_lldb_library(lldbPluginSymbolFileDWARF PLUGIN
   DWARFCompileUnit.cpp
   DWARFContext.cpp
   DWARFDataExtractor.cpp
-  DWARFDebugAbbrev.cpp
   DWARFDebugAranges.cpp
   DWARFDebugArangeSet.cpp
   DWARFDebugInfo.cpp
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 37fb16d4e0351c..6e13626d289431 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -519,6 +519,33 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc,
   return UpdateSymbolContextScopeForType(sc, die, type_sp);
 }
 
+static std::optional<uint32_t>
+ExtractDataMemberLocation(DWARFDIE const &die, DWARFFormValue const &form_value,
+                          ModuleSP module_sp) {
+  // With DWARF 3 and later, if the value is an integer constant,
+  // this form value is the offset in bytes from the beginning of
+  // the containing entity.
+  if (!form_value.BlockData())
+    return form_value.Unsigned();
+
+  Value initialValue(0);
+  Value memberOffset(0);
+  const DWARFDataExtractor &debug_info_data = die.GetData();
+  uint32_t block_length = form_value.Unsigned();
+  uint32_t block_offset =
+      form_value.BlockData() - debug_info_data.GetDataStart();
+  if (!DWARFExpression::Evaluate(
+          nullptr, // ExecutionContext *
+          nullptr, // RegisterContext *
+          module_sp, DataExtractor(debug_info_data, block_offset, block_length),
+          die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr, memberOffset,
+          nullptr)) {
+    return {};
+  }
+
+  return memberOffset.ResolveValue(nullptr).UInt();
+}
+
 lldb::TypeSP
 DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
                                        const DWARFDIE &die,
@@ -1406,26 +1433,9 @@ void DWARFASTParserClang::ParseInheritance(
         encoding_form = form_value;
         break;
       case DW_AT_data_member_location:
-        if (form_value.BlockData()) {
-          Value initialValue(0);
-          Value memberOffset(0);
-          const DWARFDataExtractor &debug_info_data = die.GetData();
-          uint32_t block_length = form_value.Unsigned();
-          uint32_t block_offset =
-              form_value.BlockData() - debug_info_data.GetDataStart();
-          if (DWARFExpression::Evaluate(
-                  nullptr, nullptr, module_sp,
-                  DataExtractor(debug_info_data, block_offset, block_length),
-                  die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr,
-                  memberOffset, nullptr)) {
-            member_byte_offset = memberOffset.ResolveValue(nullptr).UInt();
-          }
-        } else {
-          // With DWARF 3 and later, if the value is an integer constant,
-          // this form value is the offset in bytes from the beginning of
-          // the containing entity.
-          member_byte_offset = form_value.Unsigned();
-        }
+        if (auto maybe_offset =
+                ExtractDataMemberLocation(die, form_value, module_sp))
+          member_byte_offset = *maybe_offset;
         break;
 
       case DW_AT_accessibility:
@@ -2482,8 +2492,9 @@ struct MemberAttributes {
   DWARFFormValue encoding_form;
   /// Indicates the byte offset of the word from the base address of the
   /// structure.
-  uint32_t member_byte_offset;
+  uint32_t member_byte_offset = UINT32_MAX;
   bool is_artificial = false;
+  bool is_declaration = false;
 };
 
 /// Parsed form of all attributes that are relevant for parsing Objective-C
@@ -2557,29 +2568,9 @@ VariantMember::VariantMember(DWARFDIE &die, lldb::ModuleSP module_sp) {
             break;
 
           case DW_AT_data_member_location:
-            if (form_value.BlockData()) {
-              Value initialValue(0);
-              Value memberOffset(0);
-              const DWARFDataExtractor &debug_info_data = die.GetData();
-              uint32_t block_length = form_value.Unsigned();
-              uint32_t block_offset =
-                  form_value.BlockData() - debug_info_data.GetDataStart();
-              if (DWARFExpression::Evaluate(
-                      nullptr, // ExecutionContext *
-                      nullptr, // RegisterContext *
-                      module_sp,
-                      DataExtractor(debug_info_data, block_offset,
-                                    block_length),
-                      die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr,
-                      memberOffset, nullptr)) {
-                byte_offset = memberOffset.ResolveValue(nullptr).UInt();
-              }
-            } else {
-              // With DWARF 3 and later, if the value is an integer constant,
-              // this form value is the offset in bytes from the beginning of
-              // the containing entity.
-              byte_offset = form_value.Unsigned();
-            }
+            if (auto maybe_offset =
+                    ExtractDataMemberLocation(die, form_value, module_sp))
+              byte_offset = *maybe_offset;
             break;
 
           default:
@@ -2608,28 +2599,9 @@ DiscriminantValue::DiscriminantValue(const DWARFDIE &die, ModuleSP module_sp) {
         type_ref = form_value;
         break;
       case DW_AT_data_member_location:
-        if (form_value.BlockData()) {
-          Value initialValue(0);
-          Value memberOffset(0);
-          const DWARFDataExtractor &debug_info_data = die.GetData();
-          uint32_t block_length = form_value.Unsigned();
-          uint32_t block_offset =
-              form_value.BlockData() - debug_info_data.GetDataStart();
-          if (DWARFExpression::Evaluate(
-                  nullptr, // ExecutionContext *
-                  nullptr, // RegisterContext *
-                  module_sp,
-                  DataExtractor(debug_info_data, block_offset, block_length),
-                  die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr,
-                  memberOffset, nullptr)) {
-            byte_offset = memberOffset.ResolveValue(nullptr).UInt();
-          }
-        } else {
-          // With DWARF 3 and later, if the value is an integer constant,
-          // this form value is the offset in bytes from the beginning of
-          // the containing entity.
-          byte_offset = form_value.Unsigned();
-        }
+        if (auto maybe_offset =
+                ExtractDataMemberLocation(die, form_value, module_sp))
+          byte_offset = *maybe_offset;
         break;
       default:
         break;
@@ -2656,8 +2628,6 @@ DiscriminantValue &VariantPart::discriminant() { return this->_discriminant; }
 MemberAttributes::MemberAttributes(const DWARFDIE &die,
                                    const DWARFDIE &parent_die,
                                    ModuleSP module_sp) {
-  member_byte_offset = (parent_die.Tag() == DW_TAG_union_type) ? 0 : UINT32_MAX;
-
   DWARFAttributes attributes = die.GetAttributes();
   for (size_t i = 0; i < attributes.Size(); ++i) {
     const dw_attr_t attr = attributes.AttributeAtIndex(i);
@@ -2686,28 +2656,9 @@ MemberAttributes::MemberAttributes(const DWARFDIE &die,
         data_bit_offset = form_value.Unsigned();
         break;
       case DW_AT_data_member_location:
-        if (form_value.BlockData()) {
-          Value initialValue(0);
-          Value memberOffset(0);
-          const DWARFDataExtractor &debug_info_data = die.GetData();
-          uint32_t block_length = form_value.Unsigned();
-          uint32_t block_offset =
-              form_value.BlockData() - debug_info_data.GetDataStart();
-          if (DWARFExpression::Evaluate(
-                  nullptr, // ExecutionContext *
-                  nullptr, // RegisterContext *
-                  module_sp,
-                  DataExtractor(debug_info_data, block_offset, block_length),
-                  die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr,
-                  memberOffset, nullptr)) {
-            member_byte_offset = memberOffset.ResolveValue(nullptr).UInt();
-          }
-        } else {
-          // With DWARF 3 and later, if the value is an integer constant,
-          // this form value is the offset in bytes from the beginning of
-          // the containing entity.
-          member_byte_offset = form_value.Unsigned();
-        }
+        if (auto maybe_offset =
+                ExtractDataMemberLocation(die, form_value, module_sp))
+          member_byte_offset = *maybe_offset;
         break;
 
       case DW_AT_accessibility:
@@ -2717,6 +2668,9 @@ MemberAttributes::MemberAttributes(const DWARFDIE &die,
       case DW_AT_artificial:
         is_artificial = form_value.Boolean();
         break;
+      case DW_AT_declaration:
+        is_declaration = form_value.Boolean();
+        break;
       default:
         break;
       }
@@ -2923,10 +2877,18 @@ void DWARFASTParserClang::ParseSingleMember(
   if (class_is_objc_object_or_interface)
     attrs.accessibility = eAccessNone;
 
-  // Handle static members, which is any member that doesn't have a bit or a
-  // byte member offset.
+  // Handle static members, which are typically members without
+  // locations. However, GCC *never* emits DW_AT_data_member_location
+  // for static data members of unions.
+  // Non-normative text pre-DWARFv5 recommends marking static
+  // data members with an DW_AT_external flag. Clang emits this consistently
+  // whereas GCC emits it only for static data members if not part of an
+  // anonymous namespace. The flag that is consistently emitted for static
+  // data members is DW_AT_declaration, so we check it instead.
+  // FIXME: Since DWARFv5, static data members are marked DW_AT_variable so we
+  // can consistently detect them on both GCC and Clang without below heuristic.
   if (attrs.member_byte_offset == UINT32_MAX &&
-      attrs.data_bit_offset == UINT64_MAX) {
+      attrs.data_bit_offset == UINT64_MAX && attrs.is_declaration) {
     Type *var_type = die.ResolveTypeUID(attrs.encoding_form.Reference());
 
     if (var_type) {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
index ab3017ba0ffcbc..65debac4c7d926 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
@@ -12,6 +12,10 @@
 #include "DWARFUnit.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
+class DWARFAbbreviationDeclarationSet;
+}
+
 class DWARFCompileUnit : public DWARFUnit {
 public:
   void BuildAddressRangeTable(DWARFDebugAranges *debug_aranges) override;
@@ -27,7 +31,7 @@ class DWARFCompileUnit : public DWARFUnit {
 private:
   DWARFCompileUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
                    const DWARFUnitHeader &header,
-                   const DWARFAbbreviationDeclarationSet &abbrevs,
+                   const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
                    DIERef::Section section, bool is_dwo)
       : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {}
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.cpp
deleted file mode 100644
index f3c2755c5a527c..00000000000000
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "DWARFDebugAbbrev.h"
-#include "DWARFDataExtractor.h"
-#include "DWARFFormValue.h"
-#include "lldb/Utility/Stream.h"
-
-using namespace lldb;
-using namespace lldb_private;
-
-// DWARFDebugAbbrev constructor
-DWARFDebugAbbrev::DWARFDebugAbbrev(const DWARFDataExtractor &data)
-    : m_abbrevCollMap(), m_prev_abbr_offset_pos(m_abbrevCollMap.end()),
-      m_data(data.GetAsLLVM()) {}
-
-// DWARFDebugAbbrev::Parse()
-llvm::Error DWARFDebugAbbrev::parse() {
-  if (!m_data)
-    return llvm::Error::success();
-
-  lldb::offset_t offset = 0;
-
-  while (m_data->isValidOffset(offset)) {
-    uint32_t initial_cu_offset = offset;
-    DWARFAbbreviationDeclarationSet abbrevDeclSet;
-
-    llvm::Error error = abbrevDeclSet.extract(*m_data, &offset);
-    if (error) {
-      m_data = std::nullopt;
-      return error;
-    }
-
-    m_abbrevCollMap[initial_cu_offset] = abbrevDeclSet;
-  }
-  m_data = std::nullopt;
-  m_prev_abbr_offset_pos = m_abbrevCollMap.end();
-  return llvm::ErrorSuccess();
-}
-
-// DWARFDebugAbbrev::GetAbbreviationDeclarationSet()
-const DWARFAbbreviationDeclarationSet *
-DWARFDebugAbbrev::GetAbbreviationDeclarationSet(
-    dw_offset_t cu_abbr_offset) const {
-  DWARFAbbreviationDeclarationCollMapConstIter end = m_abbrevCollMap.end();
-  DWARFAbbreviationDeclarationCollMapConstIter pos;
-  if (m_prev_abbr_offset_pos != end &&
-      m_prev_abbr_offset_pos->first == cu_abbr_offset)
-    return &(m_prev_abbr_offset_pos->second);
-  else {
-    pos = m_abbrevCollMap.find(cu_abbr_offset);
-    m_prev_abbr_offset_pos = pos;
-  }
-
-  if (pos != m_abbrevCollMap.end())
-    return &(pos->second);
-  return nullptr;
-}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.h
deleted file mode 100644
index d2fade0934c8a8..00000000000000
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugAbbrev.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- DWARFDebugAbbrev.h --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLDB_SOURCE_PLUGINS_SYMBOLFILE_DWARF_DWARFDEBUGABBREV_H
-#define LLDB_SOURCE_PLUGINS_SYMBOLFILE_DWARF_DWARFDEBUGABBREV_H
-
-#include "DWARFDefines.h"
-#include "lldb/lldb-private.h"
-
-#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
-
-#include <map>
-
-using DWARFAbbreviationDeclaration = llvm::DWARFAbbreviationDeclaration;
-using DWARFAbbreviationDeclarationSet = llvm::DWARFAbbreviationDeclarationSet;
-
-typedef std::map<dw_offset_t, DWARFAbbreviationDeclarationSet>
-    DWARFAbbreviationDeclarationCollMap;
-typedef DWARFAbbreviationDeclarationCollMap::iterator
-    DWARFAbbreviationDeclarationCollMapIter;
-typedef DWARFAbbreviationDeclarationCollMap::const_iterator
-    DWARFAbbreviationDeclarationCollMapConstIter;
-
-class DWARFDebugAbbrev {
-public:
-  DWARFDebugAbbrev(const lldb_private::DWARFDataExtractor &data);
-  const DWARFAbbreviationDeclarationSet *
-  GetAbbreviationDeclarationSet(dw_offset_t cu_abbr_offset) const;
-  /// Extract all abbreviations for a particular compile unit.  Returns
-  /// llvm::ErrorSuccess() on success, and an appropriate llvm::Error object
-  /// otherwise.
-  llvm::Error parse();
-
-  DWARFAbbreviationDeclarationCollMapConstIter begin() const {
-    assert(!m_data && "Must call parse before iterating over DWARFDebugAbbrev");
-    return m_abbrevCollMap.begin();
-  }
-
-  DWARFAbbreviationDeclarationCollMapConstIter end() const {
-    return m_abbrevCollMap.end();
-  }
-
-protected:
-  DWARFAbbreviationDeclarationCollMap m_abbrevCollMap;
-  mutable DWARFAbbreviationDeclarationCollMapConstIter m_prev_abbr_offset_pos;
-  mutable std::optional<llvm::DataExtractor> m_data;
-};
-
-#endif // LLDB_SOURCE_PLUGINS_SYMBOLFILE_DWARF_DWARFDEBUGABBREV_H
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
index a08637aef06697..a6ab83700904cb 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
@@ -22,7 +22,6 @@
 #include "lldb/Utility/StreamString.h"
 
 #include "DWARFCompileUnit.h"
-#include "DWARFDebugAbbrev.h"
 #include "DWARFDebugAranges.h"
 #include "DWARFDebugInfo.h"
 #include "DWARFDebugRanges.h"
@@ -32,6 +31,8 @@
 #include "SymbolFileDWARF.h"
 #include "SymbolFileDWARFDwo.h"
 
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+
 using namespace lldb_private;
 using namespace lldb_private::dwarf;
 extern int g_verbose;
@@ -810,12 +811,13 @@ lldb::offset_t DWARFDebugInfoEntry::GetFirstAttributeOffset() const {
   return GetOffset() + llvm::getULEB128Size(m_abbr_idx);
 }
 
-const DWARFAbbreviationDeclaration *
+const llvm::DWARFAbbreviationDeclaration *
 DWARFDebugInfoEntry::GetAbbreviationDeclarationPtr(const DWARFUnit *cu) const {
   if (!cu)
     return nullptr;
 
-  const DWARFAbbreviationDeclarationSet *abbrev_set = cu->GetAbbreviations();
+  const llvm::DWARFAbbreviationDeclarationSet *abbrev_set =
+      cu->GetAbbreviations();
   if (!abbrev_set)
     return nullptr;
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
index c2ea40065232e7..29db44a16bb128 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
@@ -14,7 +14,6 @@
 
 #include "DWARFAttribute.h"
 #include "DWARFBaseDIE.h"
-#include "DWARFDebugAbbrev.h"
 #include "DWARFDebugRanges.h"
 #include <map>
 #include <optional>
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
index 5e4d48ab285a9d..5d939582a312e9 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
@@ -12,6 +12,10 @@
 #include "DWARFUnit.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
+class DWARFAbbreviationDeclarationSet;
+}
+
 class DWARFTypeUnit : public DWARFUnit {
 public:
   void BuildAddressRangeTable(DWARFDebugAranges *debug_aranges) override {}
@@ -27,7 +31,7 @@ class DWARFTypeUnit : public DWARFUnit {
 private:
   DWARFTypeUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
                 const DWARFUnitHeader &header,
-                const DWARFAbbreviationDeclarationSet &abbrevs,
+                const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
                 DIERef::Section section, bool is_dwo)
       : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {}
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index 749ffcb094ecfd..a09c68087c4765 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -13,6 +13,7 @@
 #include "lldb/Utility/LLDBAssert.h"
 #include "lldb/Utility/StreamString.h"
 #include "lldb/Utility/Timer.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
 #include "llvm/Object/Error.h"
 
@@ -32,7 +33,7 @@ extern int g_verbose;
 
 DWARFUnit::DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
                      const DWARFUnitHeader &header,
-                     const DWARFAbbreviationDeclarationSet &abbrevs,
+                     const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
                      DIERef::Section section, bool is_dwo)
     : UserID(uid), m_dwarf(dwarf), m_header(header), m_abbrevs(&abbrevs),
       m_cancel_scopes(false), m_section(section), m_is_dwo(is_dwo),
@@ -435,7 +436,8 @@ size_t DWARFUnit::GetDebugInfoSize() const {
   return GetLengthByteSize() + GetLength() - GetHeaderByteSize();
 }
 
-const DWARFAbbreviationDeclarationSet *DWARFUnit::GetAbbreviations() const {
+const llvm::DWARFAbbreviationDeclarationSet *
+DWARFUnit::GetAbbreviations() const {
   return m_abbrevs;
 }
 
@@ -875,11 +877,37 @@ const DWARFDebugAranges &DWARFUnit::GetFunctionAranges() {
   return *m_func_aranges_up;
 }
 
-llvm::Expected<DWARFUnitHeader>
-DWARFUnitHeader::extract(const DWARFDataExtractor &data,
-                         DIERef::Section section,
-                         lldb_private::DWARFContext &context,
-                         lldb::offset_t *offset_ptr) {
+llvm::Error DWARFUnitHeader::ApplyIndexEntry(
+    const llvm::DWARFUnitIndex::Entry *index_entry) {
+  // We should only be calling this function when the index entry is not set and
+  // we have a valid one to set it to.
+  assert(index_entry);
+  assert(!m_index_entry);
+
+  if (m_abbr_offset)
+    return llvm::createStringError(
+        llvm::inconvertibleErrorCode(),
+        "Package unit with a non-zero abbreviation offset");
+
+  auto *unit_contrib = index_entry->getContribution();
+  if (!unit_contrib || unit_contrib->getLength32() != m_length + 4)
+    return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                   "Inconsistent DWARF package unit index");
+
+  auto *abbr_entry = index_entry->getContribution(llvm::DW_SECT_ABBREV);
+  if (!abbr_entry)
+    return llvm::createStringError(
+        llvm::inconvertibleErrorCode(),
+        "DWARF package index missing abbreviation column");
+
+  m_abbr_offset = abbr_entry->getOffset();
+  m_index_entry = index_entry;
+  return llvm::Error::success();
+}
+
+llvm::Expected<DWARFUnitHeader> DWARFUnitHeader::extract(
+    const DWARFDataExtractor &data, DIERef::Section section,
+    lldb_private::DWARFContext &context, lldb::offset_t *offset_ptr) {
   DWARFUnitHeader header;
   header.m_offset = *offset_ptr;
   header.m_length = data.GetDWARFInitialLength(offset_ptr);
@@ -903,42 +931,6 @@ DWARFUnitHeader::extract(const DWARFDataExtractor &data,
     header.m_type_offset = data.GetDWARFOffset(offset_ptr);
   }
 
-  if (context.isDwo()) {
-    const llvm::DWARFUnitIndex *Index;
-    if (header.IsTypeUnit()) {
-      Index = &context.GetAsLLVM().getTUIndex();
-      if (*Index)
-        header.m_index_entry = Index->getFromHash(header.m_type_hash);
-    } else {
-      Index = &context.GetAsLLVM().getCUIndex();
-      if (*Index && header.m_version >= 5 && header.m_dwo_id)
-        header.m_index_entry = Index->getFromHash(*header.m_dwo_id);
-    }
-    if (!header.m_index_entry)
-      header.m_index_entry = Index->getFromOffset(header.m_offset);
-  }
-
-  if (header.m_index_entry) {
-    if (header.m_abbr_offset) {
-      return llvm::createStringError(
-          llvm::inconvertibleErrorCode(),
-          "Package unit with a non-zero abbreviation offset");
-    }
-    auto *unit_contrib = header.m_index_entry->getContribution();
-    if (!unit_contrib || unit_contrib->getLength32() != header.m_length + 4) {
-      return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                     "Inconsistent DWARF package unit index");
-    }
-    auto *abbr_entry =
-        header.m_index_entry->getContribution(llvm::DW_SECT_ABBREV);
-    if (!abbr_entry) {
-      return llvm::createStringError(
-          llvm::inconvertibleErrorCode(),
-          "DWARF package index missing abbreviation column");
-    }
-    header.m_abbr_offset = abbr_entry->getOffset();
-  }
-
   bool length_OK = data.ValidOffset(header.GetNextUnitOffset() - 1);
   bool version_OK = SymbolFileDWARF::SupportedVersion(header.m_version);
   bool addr_size_OK = (header.m_addr_size == 2) || (header.m_addr_size == 4) ||
@@ -968,12 +960,31 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
                    DIERef::Section section, lldb::offset_t *offset_ptr) {
   assert(debug_info.ValidOffset(*offset_ptr));
 
-  auto expected_header = DWARFUnitHeader::extract(
-      debug_info, section, dwarf.GetDWARFContext(), offset_ptr);
+  DWARFContext &context = dwarf.GetDWARFContext();
+  auto expected_header =
+      DWARFUnitHeader::extract(debug_info, section, context, offset_ptr);
   if (!expected_header)
     return expected_header.takeError();
 
-  const DWARFDebugAbbrev *abbr = dwarf.DebugAbbrev();
+  if (context.isDwo()) {
+    const llvm::DWARFUnitIndex::Entry *entry = nullptr;
+    const llvm::DWARFUnitIndex &index = expected_header->IsTypeUnit()
+                                            ? context.GetAsLLVM().getTUIndex()
+                                            : context.GetAsLLVM().getCUIndex();
+    if (index) {
+      if (expected_header->IsTypeUnit())
+        entry = index.getFromHash(expected_header->GetTypeHash());
+      else if (auto dwo_id = expected_header->GetDWOId())
+        entry = index.getFromHash(*dwo_id);
+    }
+    if (!entry)
+      entry = index.getFromOffset(expected_header->GetOffset());
+    if (entry)
+      if (llvm::Error err = expected_header->ApplyIndexEntry(entry))
+        return std::move(err);
+  }
+
+  const llvm::DWARFDebugAbbrev *abbr = dwarf.DebugAbbrev();
   if (!abbr)
     return llvm::make_error<llvm::object::GenericBinaryError>(
         "No debug_abbrev data");
@@ -985,8 +996,12 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
     return llvm::make_error<llvm::object::GenericBinaryError>(
         "Abbreviation offset for unit is not valid");
 
-  const DWARFAbbreviationDeclarationSet *abbrevs =
-      abbr->GetAbbreviationDeclarationSet(expected_header->GetAbbrOffset());
+  llvm::Expected<const llvm::DWARFAbbreviationDeclarationSet *> abbrevs_or_err =
+      abbr->getAbbreviationDeclarationSet(expected_header->GetAbbrOffset());
+  if (!abbrevs_or_err)
+    return abbrevs_or_err.takeError();
+
+  const llvm::DWARFAbbreviationDeclarationSet *abbrevs = *abbrevs_or_err;
   if (!abbrevs)
     return llvm::make_error<llvm::object::GenericBinaryError>(
         "No abbrev exists at the specified offset.");
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index bc55b093e894ed..20871d805e77a8 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -13,6 +13,7 @@
 #include "DWARFDebugInfoEntry.h"
 #include "lldb/Utility/XcodeSDK.h"
 #include "lldb/lldb-enumerations.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRnglists.h"
 #include "llvm/Support/RWMutex.h"
 #include <atomic>
@@ -75,6 +76,8 @@ class DWARFUnitHeader {
   }
   uint32_t GetNextUnitOffset() const { return m_offset + m_length + 4; }
 
+  llvm::Error ApplyIndexEntry(const llvm::DWARFUnitIndex::Entry *index_entry);
+
   static llvm::Expected<DWARFUnitHeader>
   extract(const lldb_private::DWARFDataExtractor &data, DIERef::Section section,
           lldb_private::DWARFContext &dwarf_context,
@@ -153,7 +156,7 @@ class DWARFUnit : public lldb_private::UserID {
   // Size of the CU data incl. header but without initial length.
   uint32_t GetLength() const { return m_header.GetLength(); }
   uint16_t GetVersion() const { return m_header.GetVersion(); }
-  const DWARFAbbreviationDeclarationSet *GetAbbreviations() const;
+  const llvm::DWARFAbbreviationDeclarationSet *GetAbbreviations() const;
   dw_offset_t GetAbbrevOffset() const;
   uint8_t GetAddressByteSize() const { return m_header.GetAddressByteSize(); }
   dw_addr_t GetAddrBase() const { return m_addr_base.value_or(0); }
@@ -291,7 +294,7 @@ class DWARFUnit : public lldb_private::UserID {
 protected:
   DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
             const DWARFUnitHeader &header,
-            const DWARFAbbreviationDeclarationSet &abbrevs,
+            const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
             DIERef::Section section, bool is_dwo);
 
   llvm::Error ExtractHeader(SymbolFileDWARF &dwarf,
@@ -323,7 +326,7 @@ class DWARFUnit : public lldb_private::UserID {
   SymbolFileDWARF &m_dwarf;
   std::shared_ptr<DWARFUnit> m_dwo;
   DWARFUnitHeader m_header;
-  const DWARFAbbreviationDeclarationSet *m_abbrevs = nullptr;
+  const llvm::DWARFAbbreviationDeclarationSet *m_abbrevs = nullptr;
   void *m_user_data = nullptr;
   // The compile unit debug information entry item
   DWARFDebugInfoEntry::collection m_die_array;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index aae481e2ae7417..e472074545a6f0 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -58,7 +58,6 @@
 #include "DWARFASTParser.h"
 #include "DWARFASTParserClang.h"
 #include "DWARFCompileUnit.h"
-#include "DWARFDebugAbbrev.h"
 #include "DWARFDebugAranges.h"
 #include "DWARFDebugInfo.h"
 #include "DWARFDebugMacro.h"
@@ -74,6 +73,7 @@
 #include "SymbolFileDWARFDwo.h"
 
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -511,7 +511,8 @@ bool SymbolFileDWARF::SupportedVersion(uint16_t version) {
   return version >= 2 && version <= 5;
 }
 
-static std::set<dw_form_t> GetUnsupportedForms(DWARFDebugAbbrev *debug_abbrev) {
+static std::set<dw_form_t>
+GetUnsupportedForms(llvm::DWARFDebugAbbrev *debug_abbrev) {
   if (!debug_abbrev)
     return {};
 
@@ -553,7 +554,7 @@ uint32_t SymbolFileDWARF::CalculateAbilities() {
       if (section)
         debug_abbrev_file_size = section->GetFileSize();
 
-      DWARFDebugAbbrev *abbrev = DebugAbbrev();
+      llvm::DWARFDebugAbbrev *abbrev = DebugAbbrev();
       std::set<dw_form_t> unsupported_forms = GetUnsupportedForms(abbrev);
       if (!unsupported_forms.empty()) {
         StreamString error;
@@ -624,7 +625,7 @@ void SymbolFileDWARF::LoadSectionData(lldb::SectionType sect_type,
   m_objfile_sp->ReadSectionData(section_sp.get(), data);
 }
 
-DWARFDebugAbbrev *SymbolFileDWARF::DebugAbbrev() {
+llvm::DWARFDebugAbbrev *SymbolFileDWARF::DebugAbbrev() {
   if (m_abbr)
     return m_abbr.get();
 
@@ -632,7 +633,8 @@ DWARFDebugAbbrev *SymbolFileDWARF::DebugAbbrev() {
   if (debug_abbrev_data.GetByteSize() == 0)
     return nullptr;
 
-  auto abbr = std::make_unique<DWARFDebugAbbrev>(debug_abbrev_data);
+  auto abbr =
+      std::make_unique<llvm::DWARFDebugAbbrev>(debug_abbrev_data.GetAsLLVM());
   llvm::Error error = abbr->parse();
   if (error) {
     Log *log = GetLog(DWARFLog::DebugInfo);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
index 191a5abcf265ab..5aaf8bd270ef7b 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
@@ -41,7 +41,6 @@
 // Forward Declarations for this DWARF plugin
 class DebugMapModule;
 class DWARFCompileUnit;
-class DWARFDebugAbbrev;
 class DWARFDebugAranges;
 class DWARFDebugInfo;
 class DWARFDebugInfoEntry;
@@ -55,6 +54,10 @@ class SymbolFileDWARFDwo;
 class SymbolFileDWARFDwp;
 class UserID;
 
+namespace llvm {
+class DWARFDebugAbbrev;
+}
+
 #define DIE_IS_BEING_PARSED ((lldb_private::Type *)1)
 
 class SymbolFileDWARF : public lldb_private::SymbolFileCommon {
@@ -224,7 +227,7 @@ class SymbolFileDWARF : public lldb_private::SymbolFileCommon {
   // PluginInterface protocol
   llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 
-  DWARFDebugAbbrev *DebugAbbrev();
+  llvm::DWARFDebugAbbrev *DebugAbbrev();
 
   DWARFDebugInfo &DebugInfo();
 
@@ -536,7 +539,7 @@ class SymbolFileDWARF : public lldb_private::SymbolFileCommon {
   llvm::once_flag m_info_once_flag;
   std::unique_ptr<DWARFDebugInfo> m_info;
 
-  std::unique_ptr<DWARFDebugAbbrev> m_abbr;
+  std::unique_ptr<llvm::DWARFDebugAbbrev> m_abbr;
   std::unique_ptr<GlobalVariableMap> m_global_aranges_up;
 
   typedef std::unordered_map<lldb::offset_t, lldb_private::DebugMacrosSP>
diff --git a/lldb/source/Utility/XcodeSDK.cpp b/lldb/source/Utility/XcodeSDK.cpp
index 84f3ccbd01e2d0..154ddbebe8b30d 100644
--- a/lldb/source/Utility/XcodeSDK.cpp
+++ b/lldb/source/Utility/XcodeSDK.cpp
@@ -56,7 +56,7 @@ XcodeSDK::XcodeSDK(XcodeSDK::Info info) : m_name(GetName(info.type).str()) {
 
 XcodeSDK &XcodeSDK::operator=(const XcodeSDK &other) = default;
 
-bool XcodeSDK::operator==(const XcodeSDK &other) {
+bool XcodeSDK::operator==(const XcodeSDK &other) const {
   return m_name == other.m_name;
 }
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/Makefile
new file mode 100644
index 00000000000000..104f82809c7a35
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/Makefile
@@ -0,0 +1,5 @@
+CXX_SOURCES := main.cpp
+
+USE_LIBSTDCPP := 1
+CXXFLAGS_EXTRAS := -std=c++17
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py
new file mode 100644
index 00000000000000..96a9c8d30c45b0
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/TestDataFormatterLibStdcxxVariant.py
@@ -0,0 +1,73 @@
+"""
+Test lldb data formatter for LibStdC++ std::variant.
+"""
+
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class LibStdcxxVariantDataFormatterTestCase(TestBase):
+    @add_test_categories(["libstdcxx"])
+    def test_with_run_command(self):
+        """Test LibStdC++ std::variant data formatter works correctly."""
+        self.build()
+
+        (self.target, self.process, _, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.cpp", False)
+        )
+
+        lldbutil.continue_to_breakpoint(self.process, bkpt)
+
+        self.expect(
+            "frame variable v1",
+            substrs=["v1 =  Active Type = int  {", "Value = 12", "}"],
+        )
+
+        self.expect(
+            "frame variable v1_ref",
+            substrs=["v1_ref =  Active Type = int : {", "Value = 12", "}"],
+        )
+
+        self.expect(
+            "frame variable v_v1",
+            substrs=[
+                "v_v1 =  Active Type = std::variant<int, double, char>  {",
+                "Value =  Active Type = int  {",
+                "Value = 12",
+                "}",
+                "}",
+            ],
+        )
+
+        lldbutil.continue_to_breakpoint(self.process, bkpt)
+
+        self.expect(
+            "frame variable v1",
+            substrs=["v1 =  Active Type = double  {", "Value = 2", "}"],
+        )
+
+        lldbutil.continue_to_breakpoint(self.process, bkpt)
+
+        self.expect(
+            "frame variable v2",
+            substrs=["v2 =  Active Type = double  {", "Value = 2", "}"],
+        )
+
+        self.expect(
+            "frame variable v3",
+            substrs=["v3 =  Active Type = char  {", "Value = 'A'", "}"],
+        )
+        """
+        TODO: temporarily disable No Value tests as they seem to fail on ubuntu/debian
+        bots. Pending reproduce and investigation.
+
+        self.expect("frame variable v_no_value", substrs=["v_no_value =  No Value"])
+
+        self.expect(
+            "frame variable v_many_types_no_value",
+            substrs=["v_many_types_no_value =  No Value"],
+        )
+        """
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/main.cpp
new file mode 100644
index 00000000000000..545318f9358b67
--- /dev/null
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/variant/main.cpp
@@ -0,0 +1,79 @@
+#include <cstdio>
+#include <string>
+#include <variant>
+#include <vector>
+
+struct S {
+  operator int() { throw 42; }
+};
+
+int main() {
+  bool has_variant = true;
+
+  printf("%d\n", has_variant); // break here
+
+  std::variant<int, double, char> v1;
+  std::variant<int, double, char> &v1_ref = v1;
+  std::variant<int, double, char> v2;
+  std::variant<int, double, char> v3;
+  std::variant<std::variant<int, double, char>> v_v1;
+  std::variant<int, double, char> v_no_value;
+  // The next variant has many types, meaning the type index does not fit in
+  // a byte and must be `unsigned short` instead of `unsigned char` when
+  // using the unstable libc++ ABI. With stable libc++ ABI, the type index
+  // is always just `unsigned int`.
+  std::variant<
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int, int, int, int,
+      int, int, int, int, int, int, int, int, int, int, int, int>
+      v_many_types_no_value;
+
+  v1 = 12; // v contains int
+  v_v1 = v1;
+  int i = std::get<int>(v1);
+  printf("%d\n", i); // break here
+
+  v2 = 2.0;
+  double d = std::get<double>(v2);
+  printf("%f\n", d);
+
+  v3 = 'A';
+  char c = std::get<char>(v3);
+  printf("%d\n", c);
+
+  // Checking v1 above and here to make sure we done maintain the incorrect
+  // state when we change its value.
+  v1 = 2.0;
+  d = std::get<double>(v1);
+  printf("%f\n", d); // break here
+
+  try {
+    v_no_value.emplace<0>(S());
+  } catch (...) {
+  }
+
+  printf("%zu\n", v_no_value.index());
+
+  try {
+    v_many_types_no_value.emplace<0>(S());
+  } catch (...) {
+  }
+
+  printf("%zu\n", v_many_types_no_value.index());
+
+  return 0; // break here
+}
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestPlatformAttach.py b/lldb/test/API/functionalities/gdb_remote_client/TestPlatformAttach.py
new file mode 100644
index 00000000000000..3aeb87874bdfa8
--- /dev/null
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestPlatformAttach.py
@@ -0,0 +1,58 @@
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test.gdbclientutils import *
+from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase
+
+
+class TestPlatformAttach(GDBRemoteTestBase):
+    @skipIfRemote
+    @skipIfWindows
+    def test_attach(self):
+        """Test attaching by name"""
+
+        class MyPlatformResponder(MockGDBServerResponder):
+            def __init__(self, port):
+                MockGDBServerResponder.__init__(self)
+                self.port = port
+
+            def qLaunchGDBServer(self, _):
+                return "pid:1337;port:{};".format(self.port)
+
+            def qfProcessInfo(self, packet):
+                return "pid:95117;name:666f6f;"
+
+        class MyGDBResponder(MockGDBServerResponder):
+            def __init__(self):
+                MockGDBServerResponder.__init__(self)
+
+            def vAttach(self, _):
+                return "OK"
+
+        self.server.responder = MyGDBResponder()
+        port = self.server._socket._server_socket.getsockname()[1]
+
+        platform_socket = TCPServerSocket()
+        platform_server = MockGDBServer(platform_socket)
+        platform_server.responder = MyPlatformResponder(port)
+        platform_server.start()
+
+        error = lldb.SBError()
+        platform = lldb.SBPlatform("remote-linux")
+        self.dbg.SetSelectedPlatform(platform)
+
+        error = platform.ConnectRemote(
+            lldb.SBPlatformConnectOptions(platform_server.get_connect_url())
+        )
+        self.assertSuccess(error)
+        self.assertTrue(platform.IsConnected())
+
+        attach_info = lldb.SBAttachInfo()
+        attach_info.SetExecutable("foo")
+
+        target = lldb.SBTarget()
+        process = platform.Attach(attach_info, self.dbg, target, error)
+        self.assertSuccess(error)
+        self.assertEqual(process.GetProcessID(), 95117)
+
+        platform.DisconnectRemote()
diff --git a/lldb/test/API/lang/cpp/union-static-data-members/Makefile b/lldb/test/API/lang/cpp/union-static-data-members/Makefile
new file mode 100644
index 00000000000000..99998b20bcb050
--- /dev/null
+++ b/lldb/test/API/lang/cpp/union-static-data-members/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/lang/cpp/union-static-data-members/TestCppUnionStaticMembers.py b/lldb/test/API/lang/cpp/union-static-data-members/TestCppUnionStaticMembers.py
new file mode 100644
index 00000000000000..47166636b12647
--- /dev/null
+++ b/lldb/test/API/lang/cpp/union-static-data-members/TestCppUnionStaticMembers.py
@@ -0,0 +1,43 @@
+"""
+Tests that frame variable and expr work for
+C++ unions and their static data members.
+"""
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+import lldbsuite.test.lldbutil as lldbutil
+
+class CppUnionStaticMembersTestCase(TestBase):
+    def test(self):
+        """Tests that frame variable and expr work
+           for union static data members"""
+        self.build()
+
+        (target, process, main_thread, _) = lldbutil.run_to_source_breakpoint(
+            self, "return 0", lldb.SBFileSpec("main.cpp")
+        )                                                                     
+
+        self.expect("frame variable foo", substrs=["val = 42"])
+        self.expect("frame variable bar", substrs=["val = 137"])
+
+        self.expect_expr("foo", result_type="Foo", result_children=[ValueCheck(
+                name="val", value="42"
+            )])
+        self.expect_expr("bar", result_type="Bar", result_children=[ValueCheck(
+                name="val", value="137"
+            )])
+
+        self.expect_expr("Foo::sVal1", result_type="const int", result_value="-42")
+        self.expect_expr("Foo::sVal2", result_type="Foo", result_children=[ValueCheck(
+                name="val", value="42"
+            )])
+
+    @expectedFailureAll
+    def test_union_in_anon_namespace(self):
+        """Tests that frame variable and expr work
+           for union static data members in anonymous
+           namespaces"""
+        self.expect_expr("Bar::sVal1", result_type="const int", result_value="-137")
+        self.expect_expr("Bar::sVal2", result_type="Bar", result_children=[ValueCheck(
+                name="val", value="137"
+            )])
diff --git a/lldb/test/API/lang/cpp/union-static-data-members/main.cpp b/lldb/test/API/lang/cpp/union-static-data-members/main.cpp
new file mode 100644
index 00000000000000..8ba0312cd3a618
--- /dev/null
+++ b/lldb/test/API/lang/cpp/union-static-data-members/main.cpp
@@ -0,0 +1,25 @@
+union Foo {
+  int val = 42;
+  static const int sVal1 = -42;
+  static Foo sVal2;
+};
+
+Foo Foo::sVal2{};
+
+namespace {
+union Bar {
+  int val = 137;
+  static const int sVal1 = -137;
+  static Bar sVal2;
+};
+
+Bar Bar::sVal2{};
+} // namespace
+
+int main() {
+  Foo foo;
+  Bar bar;
+  auto sum = Bar::sVal1 + Foo::sVal1 + Foo::sVal2.val + Bar::sVal2.val;
+
+  return 0;
+}
diff --git a/lldb/tools/debugserver/source/CMakeLists.txt b/lldb/tools/debugserver/source/CMakeLists.txt
index 43accc363ef3cb..f0b9756becab6e 100644
--- a/lldb/tools/debugserver/source/CMakeLists.txt
+++ b/lldb/tools/debugserver/source/CMakeLists.txt
@@ -19,8 +19,8 @@ endfunction()
 
 function(get_debugserver_codesign_identity result)
   string(CONCAT not_found_help
-    "This will cause failures in the test suite."
-    "Pass '-DLLDB_USE_SYSTEM_DEBUGSERVER=ON' to use the system one instead."
+    "This will cause failures in the test suite. "
+    "Pass '-DLLDB_USE_SYSTEM_DEBUGSERVER=ON' to use the system one instead. "
     "See 'Code Signing on macOS' in the documentation."
   )
 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 7b49ec161c3f3b..103c08ffbe83b3 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -6,7 +6,14 @@ set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
 include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
   NO_POLICY_SCOPE)
 
-set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
+# Builds with custom install names and installation rpath setups may not work
+# in the build tree. Allow these cases to use CMake's default build tree
+# behavior by setting `LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE` to do this.
+option(LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE "If set, use CMake's default build tree install name directory logic (Darwin only)" OFF)
+mark_as_advanced(LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE)
+if(NOT LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE)
+  set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
+endif()
 
 if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 18)
@@ -149,10 +156,7 @@ endif()
 # As we migrate runtimes to using the bootstrapping build, the set of default runtimes
 # should grow as we remove those runtimes from LLVM_ENABLE_PROJECTS above.
 set(LLVM_DEFAULT_RUNTIMES "libcxx;libcxxabi;libunwind")
-if ("flang" IN_LIST LLVM_ENABLE_PROJECTS)
-  set(LLVM_DEFAULT_RUNTIMES "libcxx;libcxxabi;libunwind;flang-rt")
-endif()
-set(LLVM_SUPPORTED_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp;llvm-libgcc;flang-rt")
+set(LLVM_SUPPORTED_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp;llvm-libgcc")
 set(LLVM_ENABLE_RUNTIMES "" CACHE STRING
   "Semicolon-separated list of runtimes to build, or \"all\" (${LLVM_DEFAULT_RUNTIMES}). Supported runtimes are ${LLVM_SUPPORTED_RUNTIMES}.")
 if(LLVM_ENABLE_RUNTIMES STREQUAL "all")
@@ -174,11 +178,6 @@ if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
   endif()
 endif()
 
-if ("flang" IN_LIST LLVM_ENABLE_PROJECTS AND NOT "flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
-  message(STATUS "Enabling Flang-rt to be built with the Flang project.")
-  list(APPEND LLVM_ENABLE_RUNTIMES "flang-rt")
-endif()
-
 # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the
 # `LLVM_ENABLE_PROJECTS` CMake cache variable.  This exists for
 # several reasons:
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index f72c41aaffb506..c5142c9e660fba 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -327,7 +327,10 @@ if( LLVM_USE_LINKER )
     CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
   check_cxx_source_compiles("int main() { return 0; }" CXX_SUPPORTS_CUSTOM_LINKER)
   if ( NOT CXX_SUPPORTS_CUSTOM_LINKER )
-    message(FATAL_ERROR "Host compiler does not support '-fuse-ld=${LLVM_USE_LINKER}'")
+    message(FATAL_ERROR "Host compiler does not support '-fuse-ld=${LLVM_USE_LINKER}'. "
+                        "Please make sure that '${LLVM_USE_LINKER}' is installed and "
+                        "that your host compiler can compile a simple program when "
+                        "given the option '-fuse-ld=${LLVM_USE_LINKER}'.")
   endif()
 endif()
 
diff --git a/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md b/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md
index 1ff0433f80e9b3..88d9a9f8c4b07e 100644
--- a/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md
+++ b/llvm/docs/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack.md
@@ -1,88 +1,10 @@
 # Allow Location Descriptions on the DWARF Expression Stack <!-- omit in toc -->
 
-- [1. Extension](#extension)
-- [2. Heterogeneous Computing Devices](#heterogeneous-computing-devices)
-- [3. DWARF 5](#dwarf-5)
-  - [3.1 How DWARF Maps Source Language To Hardware](#how-dwarf-maps-source-language-to-hardware)
-  - [3.2 Examples](#examples)
-    - [3.2.1 Dynamic Array Size](#dynamic-array-size)
-    - [3.2.2 Variable Location in Register](#variable-location-in-register)
-    - [3.2.3 Variable Location in Memory](#variable-location-in-memory)
-    - [3.2.4 Variable Spread Across Different Locations](#variable-spread-across-different-locations)
-    - [3.2.5 Offsetting a Composite Location](#offsetting-a-composite-location)
-    - [3.2.6 Pointer to Member](#pointer-to-member)
-    - [3.2.7 Virtual Base Class](#virtual-base-class)
-  - [3.3 Limitations](#limitations)
-- [4. Extension Solution](#extension-solution)
-  - [4.1 Location Description](#location-description)
-  - [4.2 Stack Location Description Operations](#stack-location-description-operations)
-  - [4.3 Examples](#examples-1)
-    - [4.3.1 Source Language Variable Spilled to Part of a Vector Register](#source-language-variable-spilled-to-part-of-a-vector-register)
-    - [4.3.2 Source Language Variable Spread Across Multiple Vector Registers](#source-language-variable-spread-across-multiple-vector-registers)
-    - [4.3.3 Source Language Variable Spread Across Multiple Kinds of Locations](#source-language-variable-spread-across-multiple-kinds-of-locations)
-    - [4.3.4 Address Spaces](#address-spaces)
-    - [4.3.5 Bit Offsets](#bit-offsets)
-  - [4.4 Call Frame Information (CFI)](#call-frame-information-cfi)
-  - [4.5 Objects Not In Byte Aligned Global Memory](#objects-not-in-byte-aligned-global-memory)
-  - [4.6 Higher Order Operations](#higher-order-operations)
-  - [4.7 Objects In Multiple Places](#objects-in-multiple-places)
-- [5. Conclusion](#conclusion)
-- [A. Changes to DWARF Debugging Information Format Version 5](#a-changes-to-dwarf-debugging-information-format-version-5)
-  - [A.2 General Description](#a-2-general-description)
-    - [A.2.5 DWARF Expressions](#a-2-5-dwarf-expressions)
-      - [A.2.5.1 DWARF Expression Evaluation Context](#a-2-5-1-dwarf-expression-evaluation-context)
-      - [A.2.5.2 DWARF Expression Value](#a-2-5-2-dwarf-expression-value)
-      - [A.2.5.3 DWARF Location Description](#a-2-5-3-dwarf-location-description)
-      - [A.2.5.4 DWARF Operation Expressions](#a-2-5-4-dwarf-operation-expressions)
-        - [A.2.5.4.1 Stack Operations](#a-2-5-4-1-stack-operations)
-        - [A.2.5.4.2 Control Flow Operations](#a-2-5-4-2-control-flow-operations)
-        - [A.2.5.4.3 Value Operations](#a-2-5-4-3-value-operations)
-          - [A.2.5.4.3.1 Literal Operations](#a-2-5-4-3-1-literal-operations)
-          - [A.2.5.4.3.2 Arithmetic and Logical Operations](#a-2-5-4-3-2-arithmetic-and-logical-operations)
-          - [A.2.5.4.3.3 Type Conversion Operations](#a-2-5-4-3-3-type-conversion-operations)
-          - [A.2.5.4.3.4 Special Value Operations](#a-2-5-4-3-4-special-value-operations)
-        - [A.2.5.4.4 Location Description Operations](#a-2-5-4-4-location-description-operations)
-          - [A.2.5.4.4.1 General Location Description Operations](#a-2-5-4-4-1-general-location-description-operations)
-          - [A.2.5.4.4.2 Undefined Location Description Operations](#a-2-5-4-4-2-undefined-location-description-operations)
-          - [A.2.5.4.4.3 Memory Location Description Operations](#a-2-5-4-4-3-memory-location-description-operations)
-          - [A.2.5.4.4.4 Register Location Description Operations](#a-2-5-4-4-4-register-location-description-operations)
-          - [A.2.5.4.4.5 Implicit Location Description Operations](#a-2-5-4-4-5-implicit-location-description-operations)
-          - [A.2.5.4.4.6 Composite Location Description Operations](#a-2-5-4-4-6-composite-location-description-operations)
-      - [A.2.5.5 DWARF Location List Expressions](#a-2-5-5-dwarf-location-list-expressions)
-  - [A.3 Program Scope Entries](#a-3-program-scope-entries)
-    - [A.3.3 Subroutine and Entry Point Entries](#a-3-3-subroutine-and-entry-point-entries)
-      - [A.3.3.5 Low-Level Information](#a-3-3-5-low-level-information)
-    - [A.3.4 Call Site Entries and Parameters](#a-3-4-call-site-entries-and-parameters)
-      - [A.3.4.2 Call Site Parameters](#a-3-4-2-call-site-parameters)
-    - [A.3.5 Lexical Block Entries](#a-3-5-lexical-block-entries)
-  - [A.4 Data Object and Object List Entries](#a-4-data-object-and-object-list-entries)
-    - [A.4.1 Data Object Entries](#a-4-1-data-object-entries)
-  - [A.5 Type Entries](#a-5-type-entries)
-    - [A.5.7 Structure, Union, Class and Interface Type Entries](#a-5-7-structure-union-class-and-interface-type-entries)
-      - [A.5.7.3 Derived or Extended Structures, Classes and Interfaces](#a-5-7-3-derived-or-extended-structures-classes-and-interfaces)
-      - [A.5.7.8 Member Function Entries](#a-5-7-8-member-function-entries)
-    - [A.5.14 Pointer to Member Type Entries](#a-5-14-pointer-to-member-type-entries)
-    - [A.5.16 Dynamic Type Entries](#a-5-16-dynamic-type-entries)
-  - [A.6 Other Debugging Information](#a-6-other-debugging-information)
-    - [A.6.2 Line Number Information](#a-6-2-line-number-information)
-    - [A.6.4 Call Frame Information](#a-6-4-call-frame-information)
-      - [A.6.4.1 Structure of Call Frame Information](#a-6-4-1-structure-of-call-frame-information)
-      - [A.6.4.2 Call Frame Instructions](#a-6-4-2-call-frame-instructions)
-        - [A.6.4.2.1 Row Creation Instructions](#a-6-4-2-1-row-creation-instructions)
-        - [A.6.4.2.2 CFA Definition Instructions](#a-6-4-2-2-cfa-definition-instructions)
-        - [A.6.4.2.3 Register Rule Instructions](#a-6-4-2-3-register-rule-instructions)
-        - [A.6.4.2.4 Row State Instructions](#a-6-4-2-4-row-state-instructions)
-        - [A.6.4.2.5 Padding Instruction](#a-6-4-2-5-padding-instruction)
-      - [A.6.4.3 Call Frame Instruction Usage](#a-6-4-3-call-frame-instruction-usage)
-      - [A.6.4.4 Call Frame Calling Address](#a-6-4-4-call-frame-calling-address)
-  - [A.7 Data Representation](#a-7-data-representation)
-    - [A.7.4 32-Bit and 64-Bit DWARF Formats](#a-7-4-32-bit-and-64-bit-dwarf-formats)
-    - [A.7.5 Format of Debugging Information](#a-7-5-format-of-debugging-information)
-      - [A.7.5.5 Classes and Forms](#a-7-5-5-classes-and-forms)
-    - [A.7.7 DWARF Expressions](#a-7-7-dwarf-expressions)
-      - [A.7.7.1 Operation Expressions](#a-7-7-1-operation-expressions)
-      - [A.7.7.3 Location List Expressions](#a-7-7-3-location-list-expressions)
-- [B. Further Information](#b-further-information)
+```{contents}
+---
+local:
+---
+```
 
 # 1. Extension
 
@@ -111,20 +33,18 @@ specialized context sensitive operations are harder for both producers and
 consumers than a smaller number of general composable operations that have
 consistent semantics regardless of context.
 
-First, section [2. Heterogeneous Computing
-Devices](#heterogeneous-computing-devices) describes heterogeneous devices and
-the features they have that are not addressed by DWARF 5. Then section [3. DWARF
-5](#dwarf-5) presents a brief simplified overview of the DWARF 5 expression
+First, section [2. Heterogeneous Computing Devices](#heterogeneous-computing-devices)
+describes heterogeneous devices and the features they have that are not addressed by DWARF 5.
+Then section [3. DWARF5](#dwarf-5) presents a brief simplified overview of the DWARF 5 expression
 evaluation model that highlights the difficulties for supporting the
 heterogeneous features. Next, section [4. Extension
 Solution](#extension-solution) provides an overview of the proposal, using
 simplified examples to illustrate how it can address the issues of heterogeneous
 devices and also benefit non-heterogeneous devices. Then overall conclusions are
-covered in section [5. Conclusion](#conclusion). Appendix [A. Changes to DWARF
-Debugging Information Format Version
-5](#a-changes-to-dwarf-debugging-information-format-version-5) gives changes
-relative to the DWARF Version 5 standard. Finally, appendix [B. Further
-Information](#b-further-information) has references to further information.
+covered in section [5. Conclusion](#conclusion).
+Appendix [A. Changes to DWARF Debugging Information Format Version 5](#changes-to-dwarf-debugging-information-format-version-5) gives changes
+relative to the DWARF Version 5 standard. Finally, appendix
+[B. Further Information](#further-information) has references to further information.
 
 # 2. Heterogeneous Computing Devices
 
@@ -625,7 +545,7 @@ Address requested for identifier "x" which is in register $rdi
 
 With location descriptions on the stack, the definition of `DW_OP_use_location`
 can be modified by replacing every instance of "address" with "location
-description", as is described in [A.5 Type Entries](#a-5-type-entries).
+description", as is described in [Type Entries](#type-entries).
 
 To implement the fully generalized version of this attribute, GCC would only
 need to change the expression from `DW_OP_plus` to `DW_OP_swap,
@@ -811,8 +731,7 @@ $2 = {<A> = <invalid address>, _vptr.B = <optimized out>}
 
 With location descriptions on the stack, the definition of
 `DW_OP_data_member_location` can be modified by replacing every instance of
-"address" with "location description", as is described in [A.5 Type
-Entries](#a-5-type-entries).
+"address" with "location description", as is described in [A.5 Type Entries](#type-entries).
 
 To implement the fully generalized version of this attribute, GCC would only
 need to change the last operation in the expression from `DW_OP_plus` to
@@ -1406,10 +1325,9 @@ is evaluated, it may be specified whether a value or location description is
 required as the result kind.
 
 If a result kind is specified, and the result of the evaluation does not match
-the specified result kind, then the implicit conversions described in [2.5.4.4.3
-Memory Location Description
-Operations](#memory-location-description-operations) are performed if
-valid. Otherwise, the DWARF expression is ill-formed.
+the specified result kind, then the implicit conversions described in
+[2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)
+are performed if valid. Otherwise, the DWARF expression is ill-formed.
 
 If the evaluation of a DWARF expression encounters an evaluation error, then the
 result is an evaluation error.
@@ -1794,9 +1712,9 @@ Operations represent a postfix operation on a simple stack machine. Each stack
 entry can hold either a value or a location description. Operations can act on
 entries on the stack, including adding entries and removing entries. If the kind
 of a stack entry does not match the kind required by the operation and is not
-implicitly convertible to the required kind (see [2.5.4.4.3 Memory Location
-Description Operations](#memory-location-description-operations)), then
-the DWARF operation expression is ill-formed.
+implicitly convertible to the required kind
+(see [2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)),
+then the DWARF operation expression is ill-formed.
 
 Evaluation of an operation expression starts with an empty stack on which the
 entries from the initial stack provided by the context are pushed in the order
@@ -1817,9 +1735,8 @@ The result of the evaluation is:
     an empty operation expression for this purpose.</i>
 
   - If the top stack entry is a location description, or can be converted to one
-    (see [2.5.4.4.3 Memory Location Description
-    Operations](#memory-location-description-operations)), then the result
-    is that, possibly converted, location description. Any other entries on the
+    (see [2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)),
+    then the result is that, possibly converted, location description. Any other entries on the
     stack are discarded.
   - Otherwise the DWARF expression is ill-formed.
 
@@ -1828,9 +1745,8 @@ The result of the evaluation is:
 
 - If the current result kind specifies a value, then:
   - If the top stack entry is a value, or can be converted to one (see
-    [2.5.4.4.3 Memory Location Description
-    Operations](#memory-location-description-operations)), then the result is
-    that, possibly converted, value. Any other entries on the stack are
+    [2.5.4.4.3 Memory Location Description Operations](#memory-location-description-operations)),
+    then the result is that, possibly converted, value. Any other entries on the stack are
     discarded.
   - Otherwise the DWARF expression is ill-formed.
 - If the current result kind is not specified, then:
@@ -1867,9 +1783,8 @@ stack assume that the top of the stack (most recently added entry) has index 0.
 They allow the stack entries to be either a value or location description.
 
 If any stack entry accessed by a stack operation is an incomplete composite
-location description (see [2.5.4.4.6 Composite Location Description
-Operations](#composite-location-description-operations)), then the DWARF
-expression is ill-formed.
+location description (see [2.5.4.4.6 Composite Location Description Operations]
+(#composite-location-description-operations)), then the DWARF expression is ill-formed.
 
 > NOTE: These operations now support stack entries that are values and location
 > descriptions.
@@ -2346,9 +2261,8 @@ There are these special value operations currently defined:
     undefined location storage or the offset of any bit exceeds the size of the
     location storage LS specified by any single location description SL of L.
 
-    See [2.5.4.4.5 Implicit Location Description
-    Operations](#implicit-location-description-operations) for special
-    rules concerning implicit location descriptions created by the
+    See [2.5.4.4.5 Implicit Location Description Operations](#implicit-location-description-operations)
+    for special rules concerning implicit location descriptions created by the
     `DW_OP_implicit_pointer` operation.
 
 5.  `DW_OP_xderef`
@@ -2606,8 +2520,8 @@ bit offset equal to V scaled by 8 (the byte size).
 If a stack entry is required to be a location description, but it is an implicit
 pointer value IPV with the target architecture default address space, then it is
 implicitly converted to a location description with one single location
-description specified by IPV. See [2.5.4.4.5 Implicit Location Description
-Operations](#implicit-location-description-operations).
+description specified by IPV. See
+[2.5.4.4.5 Implicit Location Description Operations](#implicit-location-description-operations).
 
 If a stack entry is required to be a value, but it is a location description L
 with one memory location description SL in the target architecture default
@@ -3699,8 +3613,7 @@ Frame Description Entries (FDE). There is at least one CIE in every non-empty
     value of length must be an integral multiple of the address size specified
     in the `address_size` field.
 
-2.  `CIE_id` (4 or 8 bytes, see [7.4 32-Bit and 64-Bit DWARF
-    Formats](#32-bit-and-64-bit-dwarf-formats))
+2.  `CIE_id` (4 or 8 bytes, see [7.4 32-Bit and 64-Bit DWARF Formats](#bit-and-64-bit-dwarf-formats))
 
     A constant that is used to distinguish CIEs from FDEs.
 
@@ -3796,8 +3709,7 @@ An FDE contains the following fields, in order:
     of the length field plus the value of length must be an integral multiple of
     the address size.
 
-2.  `CIE_pointer` (4 or 8 bytes, see [7.4 32-Bit and 64-Bit DWARF
-    Formats](#32-bit-and-64-bit-dwarf-formats))
+2.  `CIE_pointer` (4 or 8 bytes, see [7.4 32-Bit and 64-Bit DWARF Formats](#bit-and-64-bit-dwarf-formats))
 
     A constant offset into the `.debug_frame` section that denotes the CIE that
     is associated with this FDE.
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 668c5e7092e1c3..c70b6b8206c2f9 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -733,6 +733,13 @@ enabled sub-projects. Nearly all of these variable names begin with
   directory contains executables with the expected names, no separate
   native versions of those executables will be built.
 
+**LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE**:BOOL
+  Defaults to ``OFF``. If set to ``ON``, CMake's default logic for library IDs
+  on Darwin in the build tree will be used. Otherwise the install-time library
+  IDs will be used in the build tree as well. Mainly useful when other CMake
+  library ID control variables (e.g., ``CMAKE_INSTALL_NAME_DIR``) are being
+  set to non-standard values.
+
 **LLVM_OPTIMIZED_TABLEGEN**:BOOL
   If enabled and building a debug or asserts build the CMake build system will
   generate a Release build tree to build a fully optimized tablegen for use
diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index 79f1479716faef..67fea9520aab6d 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -50,6 +50,9 @@ Create a local branch per commit you want to submit and then push that branch
 to your `fork <https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks>`_
 of the llvm-project and
 `create a pull request from the fork <https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork>`_.
+As GitHub uses the first line of the commit message truncated to 72 characters
+as the pull request title, you may have to edit to reword or to undo this
+truncation.
 
 Creating Pull Requests with GitHub CLI
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index c8f74c19bd6b3c..9dd635e1850c00 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3394,17 +3394,82 @@ Floating-Point Environment
 The default LLVM floating-point environment assumes that traps are disabled and
 status flags are not observable. Therefore, floating-point math operations do
 not have side effects and may be speculated freely. Results assume the
-round-to-nearest rounding mode.
+round-to-nearest rounding mode, and subnormals are assumed to be preserved.
+
+Running LLVM code in an environment where these assumptions are not met can lead
+to undefined behavior. The ``strictfp`` and ``denormal-fp-math`` attributes as
+well as :ref:`Constrained Floating-Point Intrinsics <constrainedfp>` can be used
+to weaken LLVM's assumptions and ensure defined behavior in non-default
+floating-point environments; see their respective documentation for details.
+
+.. _floatnan:
+
+Behavior of Floating-Point NaN values
+-------------------------------------
+
+A floating-point NaN value consists of a sign bit, a quiet/signaling bit, and a
+payload (which makes up the rest of the mantissa except for the quiet/signaling
+bit). LLVM assumes that the quiet/signaling bit being set to ``1`` indicates a
+quiet NaN (QNaN), and a value of ``0`` indicates a signaling NaN (SNaN). In the
+following we will hence just call it the "quiet bit"
+
+The representation bits of a floating-point value do not mutate arbitrarily; in
+particular, if there is no floating-point operation being performed, NaN signs,
+quiet bits, and payloads are preserved.
+
+For the purpose of this section, ``bitcast`` as well as the following operations
+are not "floating-point math operations": ``fneg``, ``llvm.fabs``, and
+``llvm.copysign``. These operations act directly on the underlying bit
+representation and never change anything except possibly for the sign bit.
+
+For floating-point math operations, unless specified otherwise, the following
+rules apply when a NaN value is returned: the result has a non-deterministic
+sign; the quiet bit and payload are non-deterministically chosen from the
+following set of options:
+
+- The quiet bit is set and the payload is all-zero. ("Preferred NaN" case)
+- The quiet bit is set and the payload is copied from any input operand that is
+  a NaN. ("Quieting NaN propagation" case)
+- The quiet bit and payload are copied from any input operand that is a NaN.
+  ("Unchanged NaN propagation" case)
+- The quiet bit is set and the payload is picked from a target-specific set of
+  "extra" possible NaN payloads. The set can depend on the input operand values.
+  This set is empty on x86 and ARM, but can be non-empty on other architectures.
+  (For instance, on wasm, if any input NaN does not have the preferred all-zero
+  payload or any input NaN is an SNaN, then this set contains all possible
+  payloads; otherwise, it is empty. On SPARC, this set consists of the all-one
+  payload.)
+
+In particular, if all input NaNs are quiet (or if there are no input NaNs), then
+the output NaN is definitely quiet. Signaling NaN outputs can only occur if they
+are provided as an input value. For example, "fmul SNaN, 1.0" may be simplified
+to SNaN rather than QNaN. Similarly, if all input NaNs are preferred (or if
+there are no input NaNs) and the target does not have any "extra" NaN payloads,
+then the output NaN is guaranteed to be preferred.
 
 Floating-point math operations are allowed to treat all NaNs as if they were
-quiet NaNs. For example, "pow(1.0, SNaN)" may be simplified to 1.0. This also
-means that SNaN may be passed through a math operation without quieting. For
-example, "fmul SNaN, 1.0" may be simplified to SNaN rather than QNaN. However,
-SNaN values are never created by math operations. They may only occur when
-provided as a program input value.
+quiet NaNs. For example, "pow(1.0, SNaN)" may be simplified to 1.0.
 
 Code that requires different behavior than this should use the
 :ref:`Constrained Floating-Point Intrinsics <constrainedfp>`.
+In particular, constrained intrinsics rule out the "Unchanged NaN propagation"
+case; they are guaranteed to return a QNaN.
+
+Unfortunately, due to hard-or-impossible-to-fix issues, LLVM violates its own
+specification on some architectures:
+
+- x86-32 without SSE2 enabled may convert floating-point values to x86_fp80 and
+  back when performing floating-point math operations; this can lead to results
+  with different precision than expected and it can alter NaN values. Since
+  optimizations can make contradicting assumptions, this can lead to arbitrary
+  miscompilations. See `issue #44218
+  <https://github.com/llvm/llvm-project/issues/44218>`_.
+- x86-32 (even with SSE2 enabled) may implicitly perform such a conversion on
+  values returned from a function for some calling conventions. See `issue
+  #66803 <https://github.com/llvm/llvm-project/issues/66803>`_.
+- Older MIPS versions use the opposite polarity for the quiet/signaling bit, and
+  LLVM does not correctly represent this. See `issue #60796
+  <https://github.com/llvm/llvm-project/issues/60796>`_.
 
 .. _fastmath:
 
@@ -9085,6 +9150,9 @@ Semantics:
 """"""""""
 
 The value produced is a copy of the operand with its sign bit flipped.
+The value is otherwise completely identical; in particular, if the input is a
+NaN, then the quiet/signaling bit and payload are perfectly preserved.
+
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -11240,6 +11308,11 @@ The '``fptrunc``' instruction casts a ``value`` from a larger
 This instruction is assumed to execute in the default :ref:`floating-point
 environment <floatenv>`.
 
+NaN values follow the usual :ref:`NaN behaviors <floatnan>`, except that _if_ a
+NaN payload is propagated from the input ("Quieting NaN propagation" or
+"Unchanged NaN propagation" cases), then the low order bits of the NaN payload
+which cannot fit in the resulting type are discarded.
+
 Example:
 """"""""
 
@@ -11280,6 +11353,11 @@ The '``fpext``' instruction extends the ``value`` from a smaller
 *no-op cast* because it always changes bits. Use ``bitcast`` to make a
 *no-op cast* for a floating-point cast.
 
+NaN values follow the usual :ref:`NaN behaviors <floatnan>`, except that _if_ a
+NaN payload is propagated from the input ("Quieting NaN propagation" or
+"Unchanged NaN propagation" cases), then it is copied to the high order bits of
+the resulting payload, and the remaining low order bits are zero.
+
 Example:
 """"""""
 
@@ -15092,6 +15170,9 @@ Semantics:
 
 This function returns the same values as the libm ``fabs`` functions
 would, and handles error conditions in the same way.
+The returned value is completely identical to the input except for the sign bit;
+in particular, if the input is a NaN, then the quiet/signaling bit and payload
+are perfectly preserved.
 
 .. _i_minnum:
 
@@ -15307,6 +15388,9 @@ Semantics:
 
 This function returns the same values as the libm ``copysign``
 functions would, and handles error conditions in the same way.
+The returned value is completely identical to the first operand except for the
+sign bit; in particular, if the input is a NaN, then the quiet/signaling bit and
+payload are perfectly preserved.
 
 .. _int_floor:
 
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index 02e3b026a8a3b2..c5f195faaa459d 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -117,7 +117,7 @@ It returns a raw pointer value.  It does **not** check that the
 signature is valid.
 
 `key` should identify a key that is appropriate for `value`, as defined
-by the target-specific [keys](#key)).
+by the target-specific [keys](#keys)).
 
 If `value` is a raw pointer value, it is returned as-is (provided the `key`
 is appropriate for the pointer).
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 8d12d58738c609..6812efaeb36e0c 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -109,6 +109,7 @@ on support follow.
      ``Zcmp``         Assembly Support
      ``Zcmt``         Assembly Support
      ``Zdinx``        Supported
+     ``Zfa``          Supported
      ``Zfh``          Supported
      ``Zfhmin``       Supported
      ``Zfinx``        Supported
@@ -196,9 +197,6 @@ The primary goal of experimental support is to assist in the process of ratifica
 ``experimental-zacas``
   LLVM implements the `1.0-rc1 draft specification <https://github.com/riscv/riscv-zacas/releases/tag/v1.0-rc1>`_.
 
-``experimental-zfa``
-  LLVM implements the `0.2 draft specification <https://github.com/riscv/riscv-isa-manual/releases/download/draft-20230131-c0b298a/zfa-20230414.pdf>`__.
-
 ``experimental-zfbfmin``, ``experimental-zvfbfmin``, ``experimental-zvfbfwma``
   LLVM implements assembler support for the `0.8.0 draft specification <https://github.com/riscv/riscv-bfloat16/releases/tag/20230629>`_.
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 660bb4e70a5a70..f0d4b5c5dfc7af 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -109,6 +109,7 @@ Changes to the PowerPC Backend
 Changes to the RISC-V Backend
 -----------------------------
 
+* The Zfa extension version was upgraded to 1.0 and is no longer experimental.
 * Zihintntl extension version was upgraded to 1.0 and is no longer experimental.
 
 Changes to the WebAssembly Backend
diff --git a/llvm/docs/SpeculativeLoadHardening.md b/llvm/docs/SpeculativeLoadHardening.md
index 50b9ea39a42a39..0b84efb960cd09 100644
--- a/llvm/docs/SpeculativeLoadHardening.md
+++ b/llvm/docs/SpeculativeLoadHardening.md
@@ -1,6 +1,6 @@
 # Speculative Load Hardening
 
-### A Spectre Variant #1 Mitigation Technique
+## A Spectre Variant #1 Mitigation Technique
 
 Author: Chandler Carruth - [chandlerc@google.com](mailto:chandlerc@google.com)
 
diff --git a/llvm/docs/conf.py b/llvm/docs/conf.py
index cf8a75980b5303..0a3905201c8592 100644
--- a/llvm/docs/conf.py
+++ b/llvm/docs/conf.py
@@ -17,7 +17,7 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath("."))
 
 # -- General configuration -----------------------------------------------------
 
@@ -28,6 +28,13 @@
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = ["myst_parser", "sphinx.ext.intersphinx", "sphinx.ext.todo"]
 
+# Automatic anchors for markdown titles
+from llvm_slug import make_slug
+
+myst_heading_anchors = 6
+myst_heading_slug_func = make_slug
+
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
 
diff --git a/llvm/docs/llvm_slug.py b/llvm/docs/llvm_slug.py
new file mode 100644
index 00000000000000..9dd8b1cd3f1aac
--- /dev/null
+++ b/llvm/docs/llvm_slug.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+#
+# LLVM documentation anchor slug formatting
+
+# Some of our markdown documentation numbers section titles
+# This helpers is used by myst to remove that numbering from the anchor links.
+
+from docutils.nodes import make_id
+
+
+def make_slug(str):
+    import re
+
+    str = re.sub(r"^\s*(\w\.)+\w\s", "", str)
+    str = re.sub(r"^\s*\w\.\s", "", str)
+    return make_id(str)
diff --git a/llvm/docs/requirements.txt b/llvm/docs/requirements.txt
index 4d08973d255859..d66569c1b3e79e 100644
--- a/llvm/docs/requirements.txt
+++ b/llvm/docs/requirements.txt
@@ -1,4 +1,9 @@
-sphinx
-myst-parser
-sphinx-automodapi
-furo
+sphinx==7.1.2
+docutils==0.20.1
+sphinx-markdown-tables==0.0.17
+recommonmark==0.7.1
+sphinx-automodapi==0.16.0
+sphinx-bootstrap-theme==0.8.1
+sphinxcontrib-applehelp==1.0.4
+furo==2023.8.19
+myst-parser==2.0.0
diff --git a/llvm/include/llvm/ADT/PagedVector.h b/llvm/include/llvm/ADT/PagedVector.h
index 667bece6d71838..3fcca6d82cb33a 100644
--- a/llvm/include/llvm/ADT/PagedVector.h
+++ b/llvm/include/llvm/ADT/PagedVector.h
@@ -209,35 +209,33 @@ template <typename T, size_t PageSize = 1024 / sizeof(T)> class PagedVector {
       return PagePtr[ElementIdx % PageSize];
     }
 
-    friend bool operator==(MaterializedIterator const &LHS,
-                           MaterializedIterator const &RHS);
-    friend bool operator!=(MaterializedIterator const &LHS,
-                           MaterializedIterator const &RHS);
+    /// Equality operator.
+    friend bool operator==(const MaterializedIterator &LHS,
+                           const MaterializedIterator &RHS) {
+      return LHS.equals(RHS);
+    }
 
     [[nodiscard]] size_t getIndex() const { return ElementIdx; }
-  };
 
-  /// Equality operator.
-  friend bool operator==(MaterializedIterator const &LHS,
-                         MaterializedIterator const &RHS) {
-    assert(LHS.PV == RHS.PV);
-    // Make sure we are comparing either end iterators or iterators pointing
-    // to materialized elements.
-    // It should not be possible to build two iterators pointing to non
-    // materialized elements.
-    assert(LHS.ElementIdx == LHS.PV->Size ||
-           (LHS.ElementIdx < LHS.PV->Size &&
-            LHS.PV->PageToDataPtrs[LHS.ElementIdx / PageSize]));
-    assert(RHS.ElementIdx == RHS.PV->Size ||
-           (RHS.ElementIdx < RHS.PV->Size &&
-            RHS.PV->PageToDataPtrs[RHS.ElementIdx / PageSize]));
-    return LHS.ElementIdx == RHS.ElementIdx;
-  }
+    friend bool operator!=(const MaterializedIterator &LHS,
+                           const MaterializedIterator &RHS) {
+      return !(LHS == RHS);
+    }
 
-  friend bool operator!=(MaterializedIterator const &LHS,
-                         MaterializedIterator const &RHS) {
-    return !(LHS == RHS);
-  }
+  private:
+    void verify() const {
+      assert(
+          ElementIdx == PV->Size ||
+          (ElementIdx < PV->Size && PV->PageToDataPtrs[ElementIdx / PageSize]));
+    }
+
+    bool equals(const MaterializedIterator &Other) const {
+      assert(PV == Other.PV);
+      verify();
+      Other.verify();
+      return ElementIdx == Other.ElementIdx;
+    }
+  };
 
   /// Iterators over the materialized elements of the vector.
   ///
diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h
index 2840c5f608d3ea..0a228b2204d6fc 100644
--- a/llvm/include/llvm/ADT/bit.h
+++ b/llvm/include/llvm/ADT/bit.h
@@ -27,6 +27,31 @@
 #include <cstdlib>  // for _byteswap_{ushort,ulong,uint64}
 #endif
 
+#if defined(__linux__) || defined(__GNU__) || defined(__HAIKU__) ||            \
+    defined(__Fuchsia__) || defined(__EMSCRIPTEN__)
+#include <endian.h>
+#elif defined(_AIX)
+#include <sys/machine.h>
+#elif defined(__sun)
+/* Solaris provides _BIG_ENDIAN/_LITTLE_ENDIAN selector in sys/types.h */
+#include <sys/types.h>
+#define BIG_ENDIAN 4321
+#define LITTLE_ENDIAN 1234
+#if defined(_BIG_ENDIAN)
+#define BYTE_ORDER BIG_ENDIAN
+#else
+#define BYTE_ORDER LITTLE_ENDIAN
+#endif
+#elif defined(__MVS__)
+#define BIG_ENDIAN 4321
+#define LITTLE_ENDIAN 1234
+#define BYTE_ORDER BIG_ENDIAN
+#else
+#if !defined(BYTE_ORDER) && !defined(_WIN32)
+#include <machine/endian.h>
+#endif
+#endif
+
 #ifdef _MSC_VER
 // Declare these intrinsics manually rather including intrin.h. It's very
 // expensive, and bit.h is popular via MathExtras.h.
@@ -41,6 +66,16 @@ unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
 
 namespace llvm {
 
+enum class endianness {
+  big,
+  little,
+#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
+  native = big
+#else
+  native = little
+#endif
+};
+
 // This implementation of bit_cast is different from the C++20 one in two ways:
 //  - It isn't constexpr because that requires compiler support.
 //  - It requires trivially-constructible To, to avoid UB in the implementation.
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfo.h b/llvm/include/llvm/Analysis/BlockFrequencyInfo.h
index 39507570a1b2c3..95d75b0e1854fd 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfo.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfo.h
@@ -16,6 +16,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/Printable.h"
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -73,34 +74,26 @@ class BlockFrequencyInfo {
   /// Returns the estimated profile count of \p Freq.
   /// This uses the frequency \p Freq and multiplies it by
   /// the enclosing function's count (if available) and returns the value.
-  std::optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
+  std::optional<uint64_t> getProfileCountFromFreq(BlockFrequency Freq) const;
 
   /// Returns true if \p BB is an irreducible loop header
   /// block. Otherwise false.
   bool isIrrLoopHeader(const BasicBlock *BB);
 
   // Set the frequency of the given basic block.
-  void setBlockFreq(const BasicBlock *BB, uint64_t Freq);
+  void setBlockFreq(const BasicBlock *BB, BlockFrequency Freq);
 
   /// Set the frequency of \p ReferenceBB to \p Freq and scale the frequencies
   /// of the blocks in \p BlocksToScale such that their frequencies relative
   /// to \p ReferenceBB remain unchanged.
-  void setBlockFreqAndScale(const BasicBlock *ReferenceBB, uint64_t Freq,
+  void setBlockFreqAndScale(const BasicBlock *ReferenceBB, BlockFrequency Freq,
                             SmallPtrSetImpl<BasicBlock *> &BlocksToScale);
 
   /// calculate - compute block frequency info for the given function.
   void calculate(const Function &F, const BranchProbabilityInfo &BPI,
                  const LoopInfo &LI);
 
-  // Print the block frequency Freq to OS using the current functions entry
-  // frequency to convert freq into a relative decimal form.
-  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const;
-
-  // Convenience method that attempts to look up the frequency associated with
-  // BB and print it to OS.
-  raw_ostream &printBlockFreq(raw_ostream &OS, const BasicBlock *BB) const;
-
-  uint64_t getEntryFreq() const;
+  BlockFrequency getEntryFreq() const;
   void releaseMemory();
   void print(raw_ostream &OS) const;
 
@@ -108,6 +101,15 @@ class BlockFrequencyInfo {
   void verifyMatch(BlockFrequencyInfo &Other) const;
 };
 
+/// Print the block frequency @p Freq relative to the current functions entry
+/// frequency. Returns a Printable object that can be piped via `<<` to a
+/// `raw_ostream`.
+Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq);
+
+/// Convenience function equivalent to calling
+/// `printBlockFreq(BFI, BFI.getBlocakFreq(&BB))`.
+Printable printBlockFreq(const BlockFrequencyInfo &BFI, const BasicBlock &BB);
+
 /// Analysis pass which computes \c BlockFrequencyInfo.
 class BlockFrequencyAnalysis
     : public AnalysisInfoMixin<BlockFrequencyAnalysis> {
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 54d56f8472c2bc..0c05be0b9b8cdf 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -527,22 +527,21 @@ class BlockFrequencyInfoImplBase {
   getBlockProfileCount(const Function &F, const BlockNode &Node,
                        bool AllowSynthetic = false) const;
   std::optional<uint64_t>
-  getProfileCountFromFreq(const Function &F, uint64_t Freq,
+  getProfileCountFromFreq(const Function &F, BlockFrequency Freq,
                           bool AllowSynthetic = false) const;
   bool isIrrLoopHeader(const BlockNode &Node);
 
-  void setBlockFreq(const BlockNode &Node, uint64_t Freq);
+  void setBlockFreq(const BlockNode &Node, BlockFrequency Freq);
 
-  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
-  raw_ostream &printBlockFreq(raw_ostream &OS,
-                              const BlockFrequency &Freq) const;
-
-  uint64_t getEntryFreq() const {
+  BlockFrequency getEntryFreq() const {
     assert(!Freqs.empty());
-    return Freqs[0].Integer;
+    return BlockFrequency(Freqs[0].Integer);
   }
 };
 
+void printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq,
+                        BlockFrequency Freq);
+
 namespace bfi_detail {
 
 template <class BlockT> struct TypeMap {};
@@ -1029,7 +1028,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
   }
 
   std::optional<uint64_t>
-  getProfileCountFromFreq(const Function &F, uint64_t Freq,
+  getProfileCountFromFreq(const Function &F, BlockFrequency Freq,
                           bool AllowSynthetic = false) const {
     return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq,
                                                                AllowSynthetic);
@@ -1039,7 +1038,7 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
     return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
   }
 
-  void setBlockFreq(const BlockT *BB, uint64_t Freq);
+  void setBlockFreq(const BlockT *BB, BlockFrequency Freq);
 
   void forgetBlock(const BlockT *BB) {
     // We don't erase corresponding items from `Freqs`, `RPOT` and other to
@@ -1068,11 +1067,6 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
   raw_ostream &print(raw_ostream &OS) const override;
 
   using BlockFrequencyInfoImplBase::dump;
-  using BlockFrequencyInfoImplBase::printBlockFreq;
-
-  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
-    return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
-  }
 
   void verifyMatch(BlockFrequencyInfoImpl<BT> &Other) const;
 };
@@ -1145,12 +1139,13 @@ void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
     // blocks and unknown blocks.
     for (const BlockT &BB : F)
       if (!Nodes.count(&BB))
-        setBlockFreq(&BB, 0);
+        setBlockFreq(&BB, BlockFrequency());
   }
 }
 
 template <class BT>
-void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, uint64_t Freq) {
+void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB,
+                                              BlockFrequency Freq) {
   if (Nodes.count(BB))
     BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq);
   else {
@@ -1862,7 +1857,7 @@ struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits {
       OS << Node->getName() << " : ";
     switch (GType) {
     case GVDT_Fraction:
-      Graph->printBlockFreq(OS, Node);
+      OS << printBlockFreq(*Graph, *Node);
       break;
     case GVDT_Integer:
       OS << Graph->getBlockFreq(Node).getFrequency();
diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h
index 401119b3cb8524..92005da1f4c61e 100644
--- a/llvm/include/llvm/Analysis/InstructionSimplify.h
+++ b/llvm/include/llvm/Analysis/InstructionSimplify.h
@@ -118,7 +118,15 @@ struct SimplifyQuery {
                 bool CanUseUndef = true)
       : DL(DL), TLI(TLI), DT(DT), AC(AC), CxtI(CXTI), IIQ(UseInstrInfo),
         CanUseUndef(CanUseUndef) {}
-  SimplifyQuery getWithInstruction(Instruction *I) const {
+
+  SimplifyQuery(const DataLayout &DL, const DominatorTree *DT,
+                AssumptionCache *AC = nullptr,
+                const Instruction *CXTI = nullptr, bool UseInstrInfo = true,
+                bool CanUseUndef = true)
+      : DL(DL), DT(DT), AC(AC), CxtI(CXTI), IIQ(UseInstrInfo),
+        CanUseUndef(CanUseUndef) {}
+
+  SimplifyQuery getWithInstruction(const Instruction *I) const {
     SimplifyQuery Copy(*this);
     Copy.CxtI = I;
     return Copy;
diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
index 38eb71ba271d06..e49538bfaf80fb 100644
--- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -209,7 +209,7 @@ class ProfileSummaryInfo {
 
   template <typename BFIT>
   bool isColdBlock(BlockFrequency BlockFreq, const BFIT *BFI) const {
-    auto Count = BFI->getProfileCountFromFreq(BlockFreq.getFrequency());
+    auto Count = BFI->getProfileCountFromFreq(BlockFreq);
     return Count && isColdCount(*Count);
   }
 
@@ -315,7 +315,7 @@ class ProfileSummaryInfo {
   bool isHotOrColdBlockNthPercentile(int PercentileCutoff,
                                      BlockFrequency BlockFreq,
                                      BFIT *BFI) const {
-    auto Count = BFI->getProfileCountFromFreq(BlockFreq.getFrequency());
+    auto Count = BFI->getProfileCountFromFreq(BlockFreq);
     if (isHot)
       return Count && isHotCountNthPercentile(PercentileCutoff, *Count);
     else
diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h
index 4a6aec98abf861..978e1002515fc0 100644
--- a/llvm/include/llvm/Analysis/TargetFolder.h
+++ b/llvm/include/llvm/Analysis/TargetFolder.h
@@ -187,7 +187,7 @@ class TargetFolder final : public IRBuilderFolder {
   Value *FoldCast(Instruction::CastOps Op, Value *V,
                   Type *DestTy) const override {
     if (auto *C = dyn_cast<Constant>(V))
-      return Fold(ConstantExpr::getCast(Op, C, DestTy));
+      return ConstantFoldCastOperand(Op, C, DestTy, DL);
     return nullptr;
   }
 
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 5d62e837c1f3d5..2ffd4d4b714394 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -24,14 +24,46 @@ class Function;
 class Module;
 class Triple;
 
-/// Describes a possible vectorization of a function.
-/// Function 'VectorFnName' is equivalent to 'ScalarFnName' vectorized
-/// by a factor 'VectorizationFactor'.
-struct VecDesc {
+/// Provides info so a possible vectorization of a function can be
+/// computed. Function 'VectorFnName' is equivalent to 'ScalarFnName'
+/// vectorized by a factor 'VectorizationFactor'.
+/// The VABIPrefix string holds information about isa, mask, vlen,
+/// and vparams so a scalar-to-vector mapping of the form:
+///    _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
+/// can be constructed where:
+///
+/// <isa> = "_LLVM_"
+/// <mask> = "M" if masked, "N" if no mask.
+/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
+///          field of the `VecDesc` struct. If the number of lanes is scalable
+///          then 'x' is printed instead.
+/// <vparams> = "v", as many as are the numArgs.
+/// <scalarname> = the name of the scalar function.
+/// <vectorname> = the name of the vector function.
+class VecDesc {
   StringRef ScalarFnName;
   StringRef VectorFnName;
   ElementCount VectorizationFactor;
   bool Masked;
+  StringRef VABIPrefix;
+
+public:
+  VecDesc() = delete;
+  VecDesc(StringRef ScalarFnName, StringRef VectorFnName,
+          ElementCount VectorizationFactor, bool Masked, StringRef VABIPrefix)
+      : ScalarFnName(ScalarFnName), VectorFnName(VectorFnName),
+        VectorizationFactor(VectorizationFactor), Masked(Masked),
+        VABIPrefix(VABIPrefix) {}
+
+  StringRef getScalarFnName() const { return ScalarFnName; }
+  StringRef getVectorFnName() const { return VectorFnName; }
+  ElementCount getVectorizationFactor() const { return VectorizationFactor; }
+  bool isMasked() const { return Masked; }
+  StringRef getVABIPrefix() const { return VABIPrefix; }
+
+  /// Returns a vector function ABI variant string on the form:
+  ///    _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
+  std::string getVectorFunctionABIVariantString() const;
 };
 
   enum LibFunc : unsigned {
@@ -176,6 +208,12 @@ class TargetLibraryInfoImpl {
   StringRef getVectorizedFunction(StringRef F, const ElementCount &VF,
                                   bool Masked) const;
 
+  /// Return a pointer to a VecDesc object holding all info for scalar to vector
+  /// mappings in TLI for the equivalent of F, vectorized with factor VF.
+  /// If no such mapping exists, return nullpointer.
+  const VecDesc *getVectorMappingInfo(StringRef F, const ElementCount &VF,
+                                      bool Masked) const;
+
   /// Set to true iff i32 parameters to library functions should have signext
   /// or zeroext attributes if they correspond to C-level int or unsigned int,
   /// respectively.
@@ -354,6 +392,10 @@ class TargetLibraryInfo {
                                   bool Masked = false) const {
     return Impl->getVectorizedFunction(F, VF, Masked);
   }
+  const VecDesc *getVectorMappingInfo(StringRef F, const ElementCount &VF,
+                                      bool Masked) const {
+    return Impl->getVectorMappingInfo(F, VF, Masked);
+  }
 
   /// Tests if the function is both available and a candidate for optimized code
   /// generation.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457..5234ef8788d9e9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -348,6 +348,9 @@ class TargetTransformInfo {
   /// individual classes of instructions would be better.
   unsigned getInliningThresholdMultiplier() const;
 
+  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const;
+  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const;
+
   /// \returns A value to be added to the inlining threshold.
   unsigned adjustInliningThreshold(const CallBase *CB) const;
 
@@ -1696,6 +1699,9 @@ class TargetTransformInfo::Concept {
                        const TTI::PointersChainInfo &Info, Type *AccessTy,
                        TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() const = 0;
+  virtual unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const = 0;
+  virtual unsigned
+  getInliningCostBenefitAnalysisProfitableMultiplier() const = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
   virtual int getInlinerVectorBonusPercent() const = 0;
   virtual unsigned getCallerAllocaCost(const CallBase *CB,
@@ -2068,6 +2074,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned adjustInliningThreshold(const CallBase *CB) override {
     return Impl.adjustInliningThreshold(CB);
   }
+  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const override {
+    return Impl.getInliningCostBenefitAnalysisSavingsMultiplier();
+  }
+  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const override {
+    return Impl.getInliningCostBenefitAnalysisProfitableMultiplier();
+  }
   int getInlinerVectorBonusPercent() const override {
     return Impl.getInlinerVectorBonusPercent();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 326c3130c6cff7..c1ff314ae51c98 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -69,6 +69,10 @@ class TargetTransformInfoImplBase {
   }
 
   unsigned getInliningThresholdMultiplier() const { return 1; }
+  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { return 8; }
+  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
+    return 8;
+  }
   unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
   unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
     return 0;
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 695f2fecae885b..a2f5f23d94ee81 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -243,6 +243,10 @@ struct KnownFPClass {
   /// definitely set or false if the sign bit is definitely unset.
   std::optional<bool> SignBit;
 
+  bool operator==(KnownFPClass Other) const {
+    return KnownFPClasses == Other.KnownFPClasses && SignBit == Other.SignBit;
+  }
+
   /// Return true if it's known this can never be one of the mask entries.
   bool isKnownNever(FPClassTest Mask) const {
     return (KnownFPClasses & Mask) == fcNone;
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 98bcfe3843669f..b2e989e4013ea0 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -14,7 +14,7 @@
 
 #if defined(TLI_DEFINE_MASSV_VECFUNCS_NAMES)
 #define TLI_DEFINE_MASSV_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) VEC,
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
 #endif
 
 #define FIXED(NL) ElementCount::getFixed(NL)
@@ -23,860 +23,860 @@
 #define MASKED true
 
 #if !(defined(TLI_DEFINE_VECFUNC))
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF, NOMASK},
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) {SCAL, VEC, VF, NOMASK, VABI_PREFIX},
 #endif
 
 #if defined(TLI_DEFINE_ACCELERATE_VECFUNCS)
 // Accelerate framework's Vector Functions
 
 // Floating-Point Arithmetic and Auxiliary Functions
-TLI_DEFINE_VECFUNC("ceilf", "vceilf", FIXED(4))
-TLI_DEFINE_VECFUNC("fabsf", "vfabsf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.fabs.f32", "vfabsf", FIXED(4))
-TLI_DEFINE_VECFUNC("floorf", "vfloorf", FIXED(4))
-TLI_DEFINE_VECFUNC("sqrtf", "vsqrtf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sqrt.f32", "vsqrtf", FIXED(4))
+TLI_DEFINE_VECFUNC("ceilf", "vceilf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("fabsf", "vfabsf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.fabs.f32", "vfabsf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("floorf", "vfloorf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sqrtf", "vsqrtf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sqrt.f32", "vsqrtf", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Exponential and Logarithmic Functions
-TLI_DEFINE_VECFUNC("expf", "vexpf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "vexpf", FIXED(4))
-TLI_DEFINE_VECFUNC("expm1f", "vexpm1f", FIXED(4))
-TLI_DEFINE_VECFUNC("logf", "vlogf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log.f32", "vlogf", FIXED(4))
-TLI_DEFINE_VECFUNC("log1pf", "vlog1pf", FIXED(4))
-TLI_DEFINE_VECFUNC("log10f", "vlog10f", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "vlog10f", FIXED(4))
-TLI_DEFINE_VECFUNC("logbf", "vlogbf", FIXED(4))
+TLI_DEFINE_VECFUNC("expf", "vexpf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "vexpf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("expm1f", "vexpm1f", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("logf", "vlogf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "vlogf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log1pf", "vlog1pf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log10f", "vlog10f", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "vlog10f", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("logbf", "vlogbf", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Trigonometric Functions
-TLI_DEFINE_VECFUNC("sinf", "vsinf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "vsinf", FIXED(4))
-TLI_DEFINE_VECFUNC("cosf", "vcosf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "vcosf", FIXED(4))
-TLI_DEFINE_VECFUNC("tanf", "vtanf", FIXED(4))
-TLI_DEFINE_VECFUNC("asinf", "vasinf", FIXED(4))
-TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4))
-TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4))
+TLI_DEFINE_VECFUNC("sinf", "vsinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "vsinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cosf", "vcosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "vcosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanf", "vtanf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asinf", "vasinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Hyperbolic Functions
-TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4))
-TLI_DEFINE_VECFUNC("coshf", "vcoshf", FIXED(4))
-TLI_DEFINE_VECFUNC("tanhf", "vtanhf", FIXED(4))
-TLI_DEFINE_VECFUNC("asinhf", "vasinhf", FIXED(4))
-TLI_DEFINE_VECFUNC("acoshf", "vacoshf", FIXED(4))
-TLI_DEFINE_VECFUNC("atanhf", "vatanhf", FIXED(4))
+TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("coshf", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanhf", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asinhf", "vasinhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acoshf", "vacoshf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atanhf", "vatanhf", FIXED(4), "_ZGV_LLVM_N4v")
 
 #elif defined(TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS)
 // Darwin libsystem_m vector functions.
 
 // Exponential and Logarithmic Functions
-TLI_DEFINE_VECFUNC("exp", "_simd_exp_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "_simd_exp_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("expf", "_simd_exp_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "_simd_exp_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("exp", "_simd_exp_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_simd_exp_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("expf", "_simd_exp_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_simd_exp_f4", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Trigonometric Functions
-TLI_DEFINE_VECFUNC("acos", "_simd_acos_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("acosf", "_simd_acos_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("asin", "_simd_asin_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("asinf", "_simd_asin_f4", FIXED(4))
-
-TLI_DEFINE_VECFUNC("atan", "_simd_atan_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("atanf", "_simd_atan_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("atan2", "_simd_atan2_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("atan2f", "_simd_atan2_f4", FIXED(4))
-
-TLI_DEFINE_VECFUNC("cos", "_simd_cos_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "_simd_cos_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("cosf", "_simd_cos_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "_simd_cos_f4", FIXED(4))
-
-TLI_DEFINE_VECFUNC("sin", "_simd_sin_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "_simd_sin_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("sinf", "_simd_sin_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "_simd_sin_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("acos", "_simd_acos_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("acosf", "_simd_acos_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asin", "_simd_asin_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("asinf", "_simd_asin_f4", FIXED(4), "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("atan", "_simd_atan_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atanf", "_simd_atan_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atan2", "_simd_atan2_d2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("atan2f", "_simd_atan2_f4", FIXED(4), "_ZGV_LLVM_N4vv")
+
+TLI_DEFINE_VECFUNC("cos", "_simd_cos_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_simd_cos_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cosf", "_simd_cos_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_simd_cos_f4", FIXED(4), "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("sin", "_simd_sin_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_simd_sin_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinf", "_simd_sin_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_simd_sin_f4", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Floating-Point Arithmetic and Auxiliary Functions
-TLI_DEFINE_VECFUNC("cbrt", "_simd_cbrt_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("cbrtf", "_simd_cbrt_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("erf", "_simd_erf_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("erff", "_simd_erf_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("pow", "_simd_pow_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "_simd_pow_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("powf", "_simd_pow_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "_simd_pow_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("cbrt", "_simd_cbrt_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cbrtf", "_simd_cbrt_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erf", "_simd_erf_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("erff", "_simd_erf_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("pow", "_simd_pow_d2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_simd_pow_d2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("powf", "_simd_pow_f4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_simd_pow_f4", FIXED(4), "_ZGV_LLVM_N4vv")
 
 // Hyperbolic Functions
-TLI_DEFINE_VECFUNC("sinh", "_simd_sinh_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("sinhf", "_simd_sinh_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("cosh", "_simd_cosh_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("coshf", "_simd_cosh_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("tanh", "_simd_tanh_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("tanhf", "_simd_tanh_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("asinh", "_simd_asinh_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("asinhf", "_simd_asinh_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("acosh", "_simd_acosh_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("acoshf", "_simd_acosh_f4", FIXED(4))
-TLI_DEFINE_VECFUNC("atanh", "_simd_atanh_d2", FIXED(2))
-TLI_DEFINE_VECFUNC("atanhf", "_simd_atanh_f4", FIXED(4))
+TLI_DEFINE_VECFUNC("sinh", "_simd_sinh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinhf", "_simd_sinh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cosh", "_simd_cosh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("coshf", "_simd_cosh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanh", "_simd_tanh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tanhf", "_simd_tanh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asinh", "_simd_asinh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("asinhf", "_simd_asinh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acosh", "_simd_acosh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("acoshf", "_simd_acosh_f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atanh", "_simd_atanh_d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atanhf", "_simd_atanh_f4", FIXED(4), "_ZGV_LLVM_N4v")
 
 #elif defined(TLI_DEFINE_LIBMVEC_X86_VECFUNCS)
 // GLIBC Vector math Functions
 
-TLI_DEFINE_VECFUNC("sin", "_ZGVbN2v_sin", FIXED(2))
-TLI_DEFINE_VECFUNC("sin", "_ZGVdN4v_sin", FIXED(4))
+TLI_DEFINE_VECFUNC("sin", "_ZGVbN2v_sin", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sin", "_ZGVdN4v_sin", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("sinf", "_ZGVbN4v_sinf", FIXED(4))
-TLI_DEFINE_VECFUNC("sinf", "_ZGVdN8v_sinf", FIXED(8))
+TLI_DEFINE_VECFUNC("sinf", "_ZGVbN4v_sinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sinf", "_ZGVdN8v_sinf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVbN2v_sin", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVdN4v_sin", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVbN2v_sin", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVdN4v_sin", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVbN4v_sinf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVdN8v_sinf", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVbN4v_sinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVdN8v_sinf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("cos", "_ZGVbN2v_cos", FIXED(2))
-TLI_DEFINE_VECFUNC("cos", "_ZGVdN4v_cos", FIXED(4))
+TLI_DEFINE_VECFUNC("cos", "_ZGVbN2v_cos", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cos", "_ZGVdN4v_cos", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("cosf", "_ZGVbN4v_cosf", FIXED(4))
-TLI_DEFINE_VECFUNC("cosf", "_ZGVdN8v_cosf", FIXED(8))
+TLI_DEFINE_VECFUNC("cosf", "_ZGVbN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cosf", "_ZGVdN8v_cosf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVbN2v_cos", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVdN4v_cos", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVbN2v_cos", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVdN4v_cos", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVbN4v_cosf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVdN8v_cosf", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVbN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVdN8v_cosf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("pow", "_ZGVbN2vv_pow", FIXED(2))
-TLI_DEFINE_VECFUNC("pow", "_ZGVdN4vv_pow", FIXED(4))
+TLI_DEFINE_VECFUNC("pow", "_ZGVbN2vv_pow", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("pow", "_ZGVdN4vv_pow", FIXED(4), "_ZGV_LLVM_N4vv")
 
-TLI_DEFINE_VECFUNC("powf", "_ZGVbN4vv_powf", FIXED(4))
-TLI_DEFINE_VECFUNC("powf", "_ZGVdN8vv_powf", FIXED(8))
+TLI_DEFINE_VECFUNC("powf", "_ZGVbN4vv_powf", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("powf", "_ZGVdN8vv_powf", FIXED(8), "_ZGV_LLVM_N8vv")
 
-TLI_DEFINE_VECFUNC("__pow_finite", "_ZGVbN2vv___pow_finite", FIXED(2))
-TLI_DEFINE_VECFUNC("__pow_finite", "_ZGVdN4vv___pow_finite", FIXED(4))
+TLI_DEFINE_VECFUNC("__pow_finite", "_ZGVbN2vv___pow_finite", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("__pow_finite", "_ZGVdN4vv___pow_finite", FIXED(4), "_ZGV_LLVM_N4vv")
 
-TLI_DEFINE_VECFUNC("__powf_finite", "_ZGVbN4vv___powf_finite", FIXED(4))
-TLI_DEFINE_VECFUNC("__powf_finite", "_ZGVdN8vv___powf_finite", FIXED(8))
+TLI_DEFINE_VECFUNC("__powf_finite", "_ZGVbN4vv___powf_finite", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("__powf_finite", "_ZGVdN8vv___powf_finite", FIXED(8), "_ZGV_LLVM_N8vv")
 
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4), "_ZGV_LLVM_N4vv")
 
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVbN4vv_powf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVdN8vv_powf", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVbN4vv_powf", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVdN8vv_powf", FIXED(8), "_ZGV_LLVM_N8vv")
 
-TLI_DEFINE_VECFUNC("exp", "_ZGVbN2v_exp", FIXED(2))
-TLI_DEFINE_VECFUNC("exp", "_ZGVdN4v_exp", FIXED(4))
+TLI_DEFINE_VECFUNC("exp", "_ZGVbN2v_exp", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp", "_ZGVdN4v_exp", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("expf", "_ZGVbN4v_expf", FIXED(4))
-TLI_DEFINE_VECFUNC("expf", "_ZGVdN8v_expf", FIXED(8))
+TLI_DEFINE_VECFUNC("expf", "_ZGVbN4v_expf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("expf", "_ZGVdN8v_expf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__exp_finite", "_ZGVbN2v___exp_finite", FIXED(2))
-TLI_DEFINE_VECFUNC("__exp_finite", "_ZGVdN4v___exp_finite", FIXED(4))
+TLI_DEFINE_VECFUNC("__exp_finite", "_ZGVbN2v___exp_finite", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp_finite", "_ZGVdN4v___exp_finite", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("__expf_finite", "_ZGVbN4v___expf_finite", FIXED(4))
-TLI_DEFINE_VECFUNC("__expf_finite", "_ZGVdN8v___expf_finite", FIXED(8))
+TLI_DEFINE_VECFUNC("__expf_finite", "_ZGVbN4v___expf_finite", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__expf_finite", "_ZGVdN8v___expf_finite", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVbN2v_exp", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVdN4v_exp", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVbN2v_exp", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVdN4v_exp", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVbN4v_expf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVdN8v_expf", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVbN4v_expf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVdN8v_expf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("log", "_ZGVbN2v_log", FIXED(2))
-TLI_DEFINE_VECFUNC("log", "_ZGVdN4v_log", FIXED(4))
+TLI_DEFINE_VECFUNC("log", "_ZGVbN2v_log", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log", "_ZGVdN4v_log", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("logf", "_ZGVbN4v_logf", FIXED(4))
-TLI_DEFINE_VECFUNC("logf", "_ZGVdN8v_logf", FIXED(8))
+TLI_DEFINE_VECFUNC("logf", "_ZGVbN4v_logf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("logf", "_ZGVdN8v_logf", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__log_finite", "_ZGVbN2v___log_finite", FIXED(2))
-TLI_DEFINE_VECFUNC("__log_finite", "_ZGVdN4v___log_finite", FIXED(4))
+TLI_DEFINE_VECFUNC("__log_finite", "_ZGVbN2v___log_finite", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__log_finite", "_ZGVdN4v___log_finite", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("__logf_finite", "_ZGVbN4v___logf_finite", FIXED(4))
-TLI_DEFINE_VECFUNC("__logf_finite", "_ZGVdN8v___logf_finite", FIXED(8))
+TLI_DEFINE_VECFUNC("__logf_finite", "_ZGVbN4v___logf_finite", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__logf_finite", "_ZGVdN8v___logf_finite", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVbN2v_log", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", FIXED(4))
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVbN2v_log", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", FIXED(8), "_ZGV_LLVM_N8v")
 
 #elif defined(TLI_DEFINE_MASSV_VECFUNCS)
 // IBM MASS library's vector Functions
 
 // Floating-Point Arithmetic and Auxiliary Functions
-TLI_DEFINE_VECFUNC("cbrt", "__cbrtd2", FIXED(2))
-TLI_DEFINE_VECFUNC("cbrtf", "__cbrtf4", FIXED(4))
-TLI_DEFINE_VECFUNC("pow", "__powd2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "__powd2", FIXED(2))
-TLI_DEFINE_VECFUNC("powf", "__powf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "__powf4", FIXED(4))
+TLI_DEFINE_VECFUNC("cbrt", "__cbrtd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cbrtf", "__cbrtf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("pow", "__powd2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "__powd2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("powf", "__powf4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "__powf4", FIXED(4), "_ZGV_LLVM_N4vv")
 
 // Exponential and Logarithmic Functions
-TLI_DEFINE_VECFUNC("exp", "__expd2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "__expd2", FIXED(2))
-TLI_DEFINE_VECFUNC("expf", "__expf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "__expf4", FIXED(4))
-TLI_DEFINE_VECFUNC("exp2", "__exp2d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__exp2d2", FIXED(2))
-TLI_DEFINE_VECFUNC("exp2f", "__exp2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__exp2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("expm1", "__expm1d2", FIXED(2))
-TLI_DEFINE_VECFUNC("expm1f", "__expm1f4", FIXED(4))
-TLI_DEFINE_VECFUNC("log", "__logd2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log.f64", "__logd2", FIXED(2))
-TLI_DEFINE_VECFUNC("logf", "__logf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log.f32", "__logf4", FIXED(4))
-TLI_DEFINE_VECFUNC("log1p", "__log1pd2", FIXED(2))
-TLI_DEFINE_VECFUNC("log1pf", "__log1pf4", FIXED(4))
-TLI_DEFINE_VECFUNC("log10", "__log10d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "__log10d2", FIXED(2))
-TLI_DEFINE_VECFUNC("log10f", "__log10f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "__log10f4", FIXED(4))
-TLI_DEFINE_VECFUNC("log2", "__log2d2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log2.f64", "__log2d2", FIXED(2))
-TLI_DEFINE_VECFUNC("log2f", "__log2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log2.f32", "__log2f4", FIXED(4))
+TLI_DEFINE_VECFUNC("exp", "__expd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "__expd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("expf", "__expf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "__expf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp2", "__exp2d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__exp2d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp2f", "__exp2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__exp2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("expm1", "__expm1d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("expm1f", "__expm1f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log", "__logd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "__logd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("logf", "__logf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "__logf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log1p", "__log1pd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log1pf", "__log1pf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log10", "__log10d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "__log10d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log10f", "__log10f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "__log10f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log2", "__log2d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "__log2d2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log2f", "__log2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "__log2f4", FIXED(4), "_ZGV_LLVM_N4v")
 
 // Trigonometric Functions
-TLI_DEFINE_VECFUNC("sin", "__sind2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "__sind2", FIXED(2))
-TLI_DEFINE_VECFUNC("sinf", "__sinf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "__sinf4", FIXED(4))
-TLI_DEFINE_VECFUNC("cos", "__cosd2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "__cosd2", FIXED(2))
-TLI_DEFINE_VECFUNC("cosf", "__cosf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "__cosf4", FIXED(4))
-TLI_DEFINE_VECFUNC("tan", "__tand2", FIXED(2))
-TLI_DEFINE_VECFUNC("tanf", "__tanf4", FIXED(4))
-TLI_DEFINE_VECFUNC("asin", "__asind2", FIXED(2))
-TLI_DEFINE_VECFUNC("asinf", "__asinf4", FIXED(4))
-TLI_DEFINE_VECFUNC("acos", "__acosd2", FIXED(2))
-TLI_DEFINE_VECFUNC("acosf", "__acosf4", FIXED(4))
-TLI_DEFINE_VECFUNC("atan", "__atand2", FIXED(2))
-TLI_DEFINE_VECFUNC("atanf", "__atanf4", FIXED(4))
-TLI_DEFINE_VECFUNC("atan2", "__atan2d2", FIXED(2))
-TLI_DEFINE_VECFUNC("atan2f", "__atan2f4", FIXED(4))
+TLI_DEFINE_VECFUNC("sin", "__sind2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "__sind2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinf", "__sinf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "__sinf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cos", "__cosd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "__cosd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cosf", "__cosf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "__cosf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tan", "__tand2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tanf", "__tanf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asin", "__asind2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("asinf", "__asinf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acos", "__acosd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("acosf", "__acosf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atan", "__atand2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atanf", "__atanf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atan2", "__atan2d2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("atan2f", "__atan2f4", FIXED(4), "_ZGV_LLVM_N4vv")
 
 // Hyperbolic Functions
-TLI_DEFINE_VECFUNC("sinh", "__sinhd2", FIXED(2))
-TLI_DEFINE_VECFUNC("sinhf", "__sinhf4", FIXED(4))
-TLI_DEFINE_VECFUNC("cosh", "__coshd2", FIXED(2))
-TLI_DEFINE_VECFUNC("coshf", "__coshf4", FIXED(4))
-TLI_DEFINE_VECFUNC("tanh", "__tanhd2", FIXED(2))
-TLI_DEFINE_VECFUNC("tanhf", "__tanhf4", FIXED(4))
-TLI_DEFINE_VECFUNC("asinh", "__asinhd2", FIXED(2))
-TLI_DEFINE_VECFUNC("asinhf", "__asinhf4", FIXED(4))
-TLI_DEFINE_VECFUNC("acosh", "__acoshd2", FIXED(2))
-TLI_DEFINE_VECFUNC("acoshf", "__acoshf4", FIXED(4))
-TLI_DEFINE_VECFUNC("atanh", "__atanhd2", FIXED(2))
-TLI_DEFINE_VECFUNC("atanhf", "__atanhf4", FIXED(4))
+TLI_DEFINE_VECFUNC("sinh", "__sinhd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinhf", "__sinhf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cosh", "__coshd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("coshf", "__coshf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanh", "__tanhd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tanhf", "__tanhf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asinh", "__asinhd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("asinhf", "__asinhf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acosh", "__acoshd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("acoshf", "__acoshf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atanh", "__atanhd2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atanhf", "__atanhf4", FIXED(4), "_ZGV_LLVM_N4v")
 
 
 #elif defined(TLI_DEFINE_SVML_VECFUNCS)
 // Intel SVM library's Vector Functions
 
-TLI_DEFINE_VECFUNC("sin", "__svml_sin2", FIXED(2))
-TLI_DEFINE_VECFUNC("sin", "__svml_sin4", FIXED(4))
-TLI_DEFINE_VECFUNC("sin", "__svml_sin8", FIXED(8))
+TLI_DEFINE_VECFUNC("sin", "__svml_sin2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sin", "__svml_sin4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sin", "__svml_sin8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("sinf", "__svml_sinf4", FIXED(4))
-TLI_DEFINE_VECFUNC("sinf", "__svml_sinf8", FIXED(8))
-TLI_DEFINE_VECFUNC("sinf", "__svml_sinf16", FIXED(16))
+TLI_DEFINE_VECFUNC("sinf", "__svml_sinf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sinf", "__svml_sinf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("sinf", "__svml_sinf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "__svml_sin2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "__svml_sin4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "__svml_sin8", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "__svml_sin2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "__svml_sin4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "__svml_sin8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "__svml_sinf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "__svml_sinf8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "__svml_sinf16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "__svml_sinf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "__svml_sinf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "__svml_sinf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("cos", "__svml_cos2", FIXED(2))
-TLI_DEFINE_VECFUNC("cos", "__svml_cos4", FIXED(4))
-TLI_DEFINE_VECFUNC("cos", "__svml_cos8", FIXED(8))
+TLI_DEFINE_VECFUNC("cos", "__svml_cos2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cos", "__svml_cos4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cos", "__svml_cos8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("cosf", "__svml_cosf4", FIXED(4))
-TLI_DEFINE_VECFUNC("cosf", "__svml_cosf8", FIXED(8))
-TLI_DEFINE_VECFUNC("cosf", "__svml_cosf16", FIXED(16))
+TLI_DEFINE_VECFUNC("cosf", "__svml_cosf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cosf", "__svml_cosf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("cosf", "__svml_cosf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "__svml_cos2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "__svml_cos4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "__svml_cos8", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "__svml_cos2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "__svml_cos4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "__svml_cos8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("pow", "__svml_pow2", FIXED(2))
-TLI_DEFINE_VECFUNC("pow", "__svml_pow4", FIXED(4))
-TLI_DEFINE_VECFUNC("pow", "__svml_pow8", FIXED(8))
+TLI_DEFINE_VECFUNC("pow", "__svml_pow2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("pow", "__svml_pow4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("pow", "__svml_pow8", FIXED(8), "_ZGV_LLVM_N8vv")
 
-TLI_DEFINE_VECFUNC("powf", "__svml_powf4", FIXED(4))
-TLI_DEFINE_VECFUNC("powf", "__svml_powf8", FIXED(8))
-TLI_DEFINE_VECFUNC("powf", "__svml_powf16", FIXED(16))
+TLI_DEFINE_VECFUNC("powf", "__svml_powf4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("powf", "__svml_powf8", FIXED(8), "_ZGV_LLVM_N8vv")
+TLI_DEFINE_VECFUNC("powf", "__svml_powf16", FIXED(16), "_ZGV_LLVM_N16vv")
 
-TLI_DEFINE_VECFUNC("__pow_finite", "__svml_pow2", FIXED(2))
-TLI_DEFINE_VECFUNC("__pow_finite", "__svml_pow4", FIXED(4))
-TLI_DEFINE_VECFUNC("__pow_finite", "__svml_pow8", FIXED(8))
+TLI_DEFINE_VECFUNC("__pow_finite", "__svml_pow2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("__pow_finite", "__svml_pow4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("__pow_finite", "__svml_pow8", FIXED(8), "_ZGV_LLVM_N8vv")
 
-TLI_DEFINE_VECFUNC("__powf_finite", "__svml_powf4", FIXED(4))
-TLI_DEFINE_VECFUNC("__powf_finite", "__svml_powf8", FIXED(8))
-TLI_DEFINE_VECFUNC("__powf_finite", "__svml_powf16", FIXED(16))
+TLI_DEFINE_VECFUNC("__powf_finite", "__svml_powf4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("__powf_finite", "__svml_powf8", FIXED(8), "_ZGV_LLVM_N8vv")
+TLI_DEFINE_VECFUNC("__powf_finite", "__svml_powf16", FIXED(16), "_ZGV_LLVM_N16vv")
 
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "__svml_pow2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "__svml_pow4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "__svml_pow8", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "__svml_pow2", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "__svml_pow4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "__svml_pow8", FIXED(8), "_ZGV_LLVM_N8vv")
 
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "__svml_powf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "__svml_powf8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "__svml_powf16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "__svml_powf4", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "__svml_powf8", FIXED(8), "_ZGV_LLVM_N8vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "__svml_powf16", FIXED(16), "_ZGV_LLVM_N16vv")
 
-TLI_DEFINE_VECFUNC("exp", "__svml_exp2", FIXED(2))
-TLI_DEFINE_VECFUNC("exp", "__svml_exp4", FIXED(4))
-TLI_DEFINE_VECFUNC("exp", "__svml_exp8", FIXED(8))
+TLI_DEFINE_VECFUNC("exp", "__svml_exp2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp", "__svml_exp4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp", "__svml_exp8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("expf", "__svml_expf4", FIXED(4))
-TLI_DEFINE_VECFUNC("expf", "__svml_expf8", FIXED(8))
-TLI_DEFINE_VECFUNC("expf", "__svml_expf16", FIXED(16))
+TLI_DEFINE_VECFUNC("expf", "__svml_expf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("expf", "__svml_expf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("expf", "__svml_expf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("__exp_finite", "__svml_exp2", FIXED(2))
-TLI_DEFINE_VECFUNC("__exp_finite", "__svml_exp4", FIXED(4))
-TLI_DEFINE_VECFUNC("__exp_finite", "__svml_exp8", FIXED(8))
+TLI_DEFINE_VECFUNC("__exp_finite", "__svml_exp2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp_finite", "__svml_exp4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp_finite", "__svml_exp8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__expf_finite", "__svml_expf4", FIXED(4))
-TLI_DEFINE_VECFUNC("__expf_finite", "__svml_expf8", FIXED(8))
-TLI_DEFINE_VECFUNC("__expf_finite", "__svml_expf16", FIXED(16))
+TLI_DEFINE_VECFUNC("__expf_finite", "__svml_expf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__expf_finite", "__svml_expf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__expf_finite", "__svml_expf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "__svml_exp2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "__svml_exp4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "__svml_exp8", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "__svml_exp2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "__svml_exp4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "__svml_exp8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "__svml_expf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "__svml_expf8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "__svml_expf16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "__svml_expf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "__svml_expf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "__svml_expf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("log", "__svml_log2", FIXED(2))
-TLI_DEFINE_VECFUNC("log", "__svml_log4", FIXED(4))
-TLI_DEFINE_VECFUNC("log", "__svml_log8", FIXED(8))
+TLI_DEFINE_VECFUNC("log", "__svml_log2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log", "__svml_log4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log", "__svml_log8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("logf", "__svml_logf4", FIXED(4))
-TLI_DEFINE_VECFUNC("logf", "__svml_logf8", FIXED(8))
-TLI_DEFINE_VECFUNC("logf", "__svml_logf16", FIXED(16))
+TLI_DEFINE_VECFUNC("logf", "__svml_logf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("logf", "__svml_logf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("logf", "__svml_logf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("__log_finite", "__svml_log2", FIXED(2))
-TLI_DEFINE_VECFUNC("__log_finite", "__svml_log4", FIXED(4))
-TLI_DEFINE_VECFUNC("__log_finite", "__svml_log8", FIXED(8))
+TLI_DEFINE_VECFUNC("__log_finite", "__svml_log2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__log_finite", "__svml_log4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log_finite", "__svml_log8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__logf_finite", "__svml_logf4", FIXED(4))
-TLI_DEFINE_VECFUNC("__logf_finite", "__svml_logf8", FIXED(8))
-TLI_DEFINE_VECFUNC("__logf_finite", "__svml_logf16", FIXED(16))
+TLI_DEFINE_VECFUNC("__logf_finite", "__svml_logf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__logf_finite", "__svml_logf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__logf_finite", "__svml_logf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.log.f64", "__svml_log2", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log.f64", "__svml_log4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log.f64", "__svml_log8", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.log.f64", "__svml_log2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "__svml_log4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "__svml_log8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "__svml_logf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("log2", "__svml_log22", FIXED(2))
-TLI_DEFINE_VECFUNC("log2", "__svml_log24", FIXED(4))
-TLI_DEFINE_VECFUNC("log2", "__svml_log28", FIXED(8))
+TLI_DEFINE_VECFUNC("log2", "__svml_log22", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log2", "__svml_log24", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log2", "__svml_log28", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("log2f", "__svml_log2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("log2f", "__svml_log2f8", FIXED(8))
-TLI_DEFINE_VECFUNC("log2f", "__svml_log2f16", FIXED(16))
+TLI_DEFINE_VECFUNC("log2f", "__svml_log2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log2f", "__svml_log2f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log2f", "__svml_log2f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log22", FIXED(2))
-TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log24", FIXED(4))
-TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log28", FIXED(8))
+TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log22", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log24", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log2_finite", "__svml_log28", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f8", FIXED(8))
-TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f16", FIXED(16))
+TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__log2f_finite", "__svml_log2f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log22", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log24", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log28", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log22", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log24", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "__svml_log28", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "__svml_log2f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("log10", "__svml_log102", FIXED(2))
-TLI_DEFINE_VECFUNC("log10", "__svml_log104", FIXED(4))
-TLI_DEFINE_VECFUNC("log10", "__svml_log108", FIXED(8))
+TLI_DEFINE_VECFUNC("log10", "__svml_log102", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log10", "__svml_log104", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log10", "__svml_log108", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("log10f", "__svml_log10f4", FIXED(4))
-TLI_DEFINE_VECFUNC("log10f", "__svml_log10f8", FIXED(8))
-TLI_DEFINE_VECFUNC("log10f", "__svml_log10f16", FIXED(16))
+TLI_DEFINE_VECFUNC("log10f", "__svml_log10f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log10f", "__svml_log10f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log10f", "__svml_log10f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log102", FIXED(2))
-TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log104", FIXED(4))
-TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log108", FIXED(8))
+TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log102", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log104", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log10_finite", "__svml_log108", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f4", FIXED(4))
-TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f8", FIXED(8))
-TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f16", FIXED(16))
+TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__log10f_finite", "__svml_log10f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log102", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log104", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log108", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log102", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log104", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "__svml_log108", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "__svml_log10f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt2", FIXED(2))
-TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt4", FIXED(4))
-TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt8", FIXED(8))
+TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sqrt", "__svml_sqrt8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf4", FIXED(4))
-TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf8", FIXED(8))
-TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf16", FIXED(16))
+TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("sqrtf", "__svml_sqrtf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt2", FIXED(2))
-TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt4", FIXED(4))
-TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt8", FIXED(8))
+TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__sqrt_finite", "__svml_sqrt8", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf4", FIXED(4))
-TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf8", FIXED(8))
-TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf16", FIXED(16))
+TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__sqrtf_finite", "__svml_sqrtf16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("exp2", "__svml_exp22", FIXED(2))
-TLI_DEFINE_VECFUNC("exp2", "__svml_exp24", FIXED(4))
-TLI_DEFINE_VECFUNC("exp2", "__svml_exp28", FIXED(8))
+TLI_DEFINE_VECFUNC("exp2", "__svml_exp22", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp2", "__svml_exp24", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp2", "__svml_exp28", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("exp2f", "__svml_exp2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("exp2f", "__svml_exp2f8", FIXED(8))
-TLI_DEFINE_VECFUNC("exp2f", "__svml_exp2f16", FIXED(16))
+TLI_DEFINE_VECFUNC("exp2f", "__svml_exp2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp2f", "__svml_exp2f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("exp2f", "__svml_exp2f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__svml_exp22", FIXED(2))
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__svml_exp24", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__svml_exp28", FIXED(8))
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__svml_exp22", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__svml_exp24", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "__svml_exp28", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__svml_exp2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__svml_exp2f8", FIXED(8))
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__svml_exp2f16", FIXED(16))
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__svml_exp2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__svml_exp2f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "__svml_exp2f16", FIXED(16), "_ZGV_LLVM_N16v")
 
-TLI_DEFINE_VECFUNC("__exp2_finite", "__svml_exp22", FIXED(2))
-TLI_DEFINE_VECFUNC("__exp2_finite", "__svml_exp24", FIXED(4))
-TLI_DEFINE_VECFUNC("__exp2_finite", "__svml_exp28", FIXED(8))
+TLI_DEFINE_VECFUNC("__exp2_finite", "__svml_exp22", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp2_finite", "__svml_exp24", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp2_finite", "__svml_exp28", FIXED(8), "_ZGV_LLVM_N8v")
 
-TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f4", FIXED(4))
-TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f8", FIXED(8))
-TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", FIXED(16))
+TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f4", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f8", FIXED(8), "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__exp2f_finite", "__svml_exp2f16", FIXED(16), "_ZGV_LLVM_N16v")
 
 #elif defined(TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS)
 
-TLI_DEFINE_VECFUNC( "acos", "_ZGVnN2v_acos", FIXED(2))
+TLI_DEFINE_VECFUNC( "acos", "_ZGVnN2v_acos", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "asin", "_ZGVnN2v_asin", FIXED(2))
+TLI_DEFINE_VECFUNC( "asin", "_ZGVnN2v_asin", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "atan", "_ZGVnN2v_atan", FIXED(2))
+TLI_DEFINE_VECFUNC( "atan", "_ZGVnN2v_atan", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "atan2", "_ZGVnN2vv_atan2", FIXED(2))
+TLI_DEFINE_VECFUNC( "atan2", "_ZGVnN2vv_atan2", FIXED(2), "_ZGV_LLVM_N2vv")
 
-TLI_DEFINE_VECFUNC( "atanh", "_ZGVnN2v_atanh", FIXED(2))
+TLI_DEFINE_VECFUNC( "atanh", "_ZGVnN2v_atanh", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "cos", "_ZGVnN2v_cos", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2))
+TLI_DEFINE_VECFUNC( "cos", "_ZGVnN2v_cos", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "cosh", "_ZGVnN2v_cosh", FIXED(2))
+TLI_DEFINE_VECFUNC( "cosh", "_ZGVnN2v_cosh", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "exp", "_ZGVnN2v_exp", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2))
+TLI_DEFINE_VECFUNC( "exp", "_ZGVnN2v_exp", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "exp2", "_ZGVnN2v_exp2", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.exp2.f64", "_ZGVnN2v_exp2", FIXED(2))
+TLI_DEFINE_VECFUNC( "exp2", "_ZGVnN2v_exp2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.exp2.f64", "_ZGVnN2v_exp2", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "exp10", "_ZGVnN2v_exp10", FIXED(2))
+TLI_DEFINE_VECFUNC( "exp10", "_ZGVnN2v_exp10", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "lgamma", "_ZGVnN2v_lgamma", FIXED(2))
+TLI_DEFINE_VECFUNC( "lgamma", "_ZGVnN2v_lgamma", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "log", "_ZGVnN2v_log", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.log.f64", "_ZGVnN2v_log", FIXED(2))
+TLI_DEFINE_VECFUNC( "log", "_ZGVnN2v_log", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.log.f64", "_ZGVnN2v_log", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "log2", "_ZGVnN2v_log2", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.log2.f64", "_ZGVnN2v_log2", FIXED(2))
+TLI_DEFINE_VECFUNC( "log2", "_ZGVnN2v_log2", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.log2.f64", "_ZGVnN2v_log2", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "log10", "_ZGVnN2v_log10", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.log10.f64", "_ZGVnN2v_log10", FIXED(2))
+TLI_DEFINE_VECFUNC( "log10", "_ZGVnN2v_log10", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.log10.f64", "_ZGVnN2v_log10", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "pow", "_ZGVnN2vv_pow", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2))
+TLI_DEFINE_VECFUNC( "pow", "_ZGVnN2vv_pow", FIXED(2), "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC( "llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2), "_ZGV_LLVM_N2vv")
 
-TLI_DEFINE_VECFUNC( "sin", "_ZGVnN2v_sin", FIXED(2))
-TLI_DEFINE_VECFUNC( "llvm.sin.f64", "_ZGVnN2v_sin", FIXED(2))
+TLI_DEFINE_VECFUNC( "sin", "_ZGVnN2v_sin", FIXED(2), "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC( "llvm.sin.f64", "_ZGVnN2v_sin", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "sinh", "_ZGVnN2v_sinh", FIXED(2))
+TLI_DEFINE_VECFUNC( "sinh", "_ZGVnN2v_sinh", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "sqrt", "_ZGVnN2v_sqrt", FIXED(2))
+TLI_DEFINE_VECFUNC( "sqrt", "_ZGVnN2v_sqrt", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "tan", "_ZGVnN2v_tan", FIXED(2))
+TLI_DEFINE_VECFUNC( "tan", "_ZGVnN2v_tan", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "tanh", "_ZGVnN2v_tanh", FIXED(2))
+TLI_DEFINE_VECFUNC( "tanh", "_ZGVnN2v_tanh", FIXED(2), "_ZGV_LLVM_N2v")
 
-TLI_DEFINE_VECFUNC( "tgamma", "_ZGVnN2v_tgamma", FIXED(2))
+TLI_DEFINE_VECFUNC( "tgamma", "_ZGVnN2v_tgamma", FIXED(2), "_ZGV_LLVM_N2v")
 
 #elif defined(TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS)
 
-TLI_DEFINE_VECFUNC( "acosf", "_ZGVnN4v_acosf", FIXED(4))
+TLI_DEFINE_VECFUNC( "acosf", "_ZGVnN4v_acosf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "asinf", "_ZGVnN4v_asinf", FIXED(4))
+TLI_DEFINE_VECFUNC( "asinf", "_ZGVnN4v_asinf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "atanf", "_ZGVnN4v_atanf", FIXED(4))
+TLI_DEFINE_VECFUNC( "atanf", "_ZGVnN4v_atanf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "atan2f", "_ZGVnN4vv_atan2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "atan2f", "_ZGVnN4vv_atan2f", FIXED(4), "_ZGV_LLVM_N4vv")
 
-TLI_DEFINE_VECFUNC( "atanhf", "_ZGVnN4v_atanhf", FIXED(4))
+TLI_DEFINE_VECFUNC( "atanhf", "_ZGVnN4v_atanhf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "cosf", "_ZGVnN4v_cosf", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4))
+TLI_DEFINE_VECFUNC( "cosf", "_ZGVnN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "coshf", "_ZGVnN4v_coshf", FIXED(4))
+TLI_DEFINE_VECFUNC( "coshf", "_ZGVnN4v_coshf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "expf", "_ZGVnN4v_expf", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4))
+TLI_DEFINE_VECFUNC( "expf", "_ZGVnN4v_expf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "exp2f", "_ZGVnN4v_exp2f", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.exp2.f32", "_ZGVnN4v_exp2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "exp2f", "_ZGVnN4v_exp2f", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.exp2.f32", "_ZGVnN4v_exp2f", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "exp10f", "_ZGVnN4v_exp10f", FIXED(4))
+TLI_DEFINE_VECFUNC( "exp10f", "_ZGVnN4v_exp10f", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "lgammaf", "_ZGVnN4v_lgammaf", FIXED(4))
+TLI_DEFINE_VECFUNC( "lgammaf", "_ZGVnN4v_lgammaf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "logf", "_ZGVnN4v_logf", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.log.f32", "_ZGVnN4v_logf", FIXED(4))
+TLI_DEFINE_VECFUNC( "logf", "_ZGVnN4v_logf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.log.f32", "_ZGVnN4v_logf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "log2f", "_ZGVnN4v_log2f", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.log2.f32", "_ZGVnN4v_log2f", FIXED(4))
+TLI_DEFINE_VECFUNC( "log2f", "_ZGVnN4v_log2f", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.log2.f32", "_ZGVnN4v_log2f", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "log10f", "_ZGVnN4v_log10f", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.log10.f32", "_ZGVnN4v_log10f", FIXED(4))
+TLI_DEFINE_VECFUNC( "log10f", "_ZGVnN4v_log10f", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.log10.f32", "_ZGVnN4v_log10f", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "powf", "_ZGVnN4vv_powf", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4))
+TLI_DEFINE_VECFUNC( "powf", "_ZGVnN4vv_powf", FIXED(4), "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC( "llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4), "_ZGV_LLVM_N4vv")
 
-TLI_DEFINE_VECFUNC( "sinf", "_ZGVnN4v_sinf", FIXED(4))
-TLI_DEFINE_VECFUNC( "llvm.sin.f32", "_ZGVnN4v_sinf", FIXED(4))
+TLI_DEFINE_VECFUNC( "sinf", "_ZGVnN4v_sinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC( "llvm.sin.f32", "_ZGVnN4v_sinf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "sinhf", "_ZGVnN4v_sinhf", FIXED(4))
+TLI_DEFINE_VECFUNC( "sinhf", "_ZGVnN4v_sinhf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "sqrtf", "_ZGVnN4v_sqrtf", FIXED(4))
+TLI_DEFINE_VECFUNC( "sqrtf", "_ZGVnN4v_sqrtf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "tanf", "_ZGVnN4v_tanf", FIXED(4))
+TLI_DEFINE_VECFUNC( "tanf", "_ZGVnN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "tanhf", "_ZGVnN4v_tanhf", FIXED(4))
+TLI_DEFINE_VECFUNC( "tanhf", "_ZGVnN4v_tanhf", FIXED(4), "_ZGV_LLVM_N4v")
 
-TLI_DEFINE_VECFUNC( "tgammaf", "_ZGVnN4v_tgammaf", FIXED(4))
+TLI_DEFINE_VECFUNC( "tgammaf", "_ZGVnN4v_tgammaf", FIXED(4), "_ZGV_LLVM_N4v")
 
 #elif defined(TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS)
 
-TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv")
 
-TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("fmod", "_ZGVsMxvv_fmod", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("fmodf", "_ZGVsMxvv_fmodf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("fmod", "_ZGVsMxvv_fmod", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("fmodf", "_ZGVsMxvv_fmodf", SCALABLE(4), MASKED, "_ZGVsMxvv")
 
-TLI_DEFINE_VECFUNC("lgamma", "_ZGVsMxv_lgamma",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("lgammaf", "_ZGVsMxv_lgammaf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("lgamma", "_ZGVsMxv_lgamma",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("lgammaf", "_ZGVsMxv_lgammaf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC( "log2", "_ZGVsMxv_log2", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC( "log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC( "llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC( "llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC( "log2", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC( "log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC( "llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC( "llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv")
 
-TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED)
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("sqrt", "_ZGVsMxv_sqrt",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sqrtf", "_ZGVsMxv_sqrtf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("sqrt", "_ZGVsMxv_sqrt",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sqrtf", "_ZGVsMxv_sqrtf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("tgamma", "_ZGVsMxv_tgamma",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("tgamma", "_ZGVsMxv_tgamma",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 #elif defined(TLI_DEFINE_ARMPL_VECFUNCS)
 
-TLI_DEFINE_VECFUNC("acos", "armpl_vacosq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("acosf", "armpl_vacosq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("acos", "armpl_svacos_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("acosf", "armpl_svacos_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("acosh", "armpl_vacoshq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("acoshf", "armpl_vacoshq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("acosh", "armpl_svacosh_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("acoshf", "armpl_svacosh_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("asin", "armpl_vasinq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("asinf", "armpl_vasinq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("asin", "armpl_svasin_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("asinf", "armpl_svasin_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("asinh", "armpl_vasinhq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("asinhf", "armpl_vasinhq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("asinh", "armpl_svasinh_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("asinhf", "armpl_svasinh_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("atan", "armpl_vatanq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("atanf", "armpl_vatanq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("atan", "armpl_svatan_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("atanf", "armpl_svatan_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("atan2", "armpl_vatan2q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("atan2f", "armpl_vatan2q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("atan2", "armpl_svatan2_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("atan2f", "armpl_svatan2_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("atanh", "armpl_vatanhq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("atanhf", "armpl_vatanhq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("atanh", "armpl_svatanh_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("atanhf", "armpl_svatanh_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("cbrt", "armpl_vcbrtq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("cbrtf", "armpl_vcbrtq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("cbrt", "armpl_svcbrt_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("cbrtf", "armpl_svcbrt_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("copysign", "armpl_vcopysignq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("copysignf", "armpl_vcopysignq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("copysign", "armpl_svcopysign_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("copysignf", "armpl_svcopysign_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("cos", "armpl_vcosq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("cosf", "armpl_vcosq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("cos", "armpl_svcos_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("cosf", "armpl_svcos_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "armpl_vcosq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "armpl_vcosq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.cos.f64", "armpl_svcos_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.cos.f32", "armpl_svcos_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("cosh", "armpl_vcoshq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("coshf", "armpl_vcoshq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("cosh", "armpl_svcosh_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("coshf", "armpl_svcosh_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("erf", "armpl_verfq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("erff", "armpl_verfq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("erf", "armpl_sverf_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("erff", "armpl_sverf_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("erfc", "armpl_verfcq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("erfcf", "armpl_verfcq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("erfc", "armpl_sverfc_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("erfcf", "armpl_sverfc_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("exp", "armpl_vexpq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("expf", "armpl_vexpq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("exp", "armpl_svexp_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("expf", "armpl_svexp_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "armpl_vexpq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "armpl_vexpq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.exp.f64", "armpl_svexp_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.exp.f32", "armpl_svexp_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("exp2", "armpl_vexp2q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("exp2f", "armpl_vexp2q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("exp2", "armpl_svexp2_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("exp2f", "armpl_svexp2_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "armpl_vexp2q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "armpl_vexp2q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.exp2.f64", "armpl_svexp2_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.exp2.f32", "armpl_svexp2_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("exp10", "armpl_vexp10q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("exp10f", "armpl_vexp10q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("exp10", "armpl_svexp10_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("exp10f", "armpl_svexp10_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("expm1", "armpl_vexpm1q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("expm1f", "armpl_vexpm1q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("expm1", "armpl_svexpm1_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("expm1f", "armpl_svexpm1_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("fdim", "armpl_vfdimq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("fdimf", "armpl_vfdimq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("fdim", "armpl_svfdim_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("fdimf", "armpl_svfdim_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("fma", "armpl_vfmaq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("fmaf", "armpl_vfmaq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("fma", "armpl_svfma_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("fmaf", "armpl_svfma_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("fmin", "armpl_vfminq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("fminf", "armpl_vfminq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("fmin", "armpl_svfmin_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("fminf", "armpl_svfmin_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("fmod", "armpl_vfmodq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("fmodf", "armpl_vfmodq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("fmod", "armpl_svfmod_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("fmodf", "armpl_svfmod_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("hypot", "armpl_vhypotq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("hypotf", "armpl_vhypotq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("hypot", "armpl_svhypot_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("hypotf", "armpl_svhypot_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("lgamma", "armpl_vlgammaq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("lgammaf", "armpl_vlgammaq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("lgamma", "armpl_svlgamma_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("lgammaf", "armpl_svlgamma_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("log", "armpl_vlogq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("logf", "armpl_vlogq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("log", "armpl_svlog_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("logf", "armpl_svlog_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.log.f64", "armpl_vlogq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.log.f32", "armpl_vlogq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.log.f64", "armpl_svlog_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log.f32", "armpl_svlog_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("log1p", "armpl_vlog1pq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("log1pf", "armpl_vlog1pq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("log1p", "armpl_svlog1p_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("log1pf", "armpl_svlog1p_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("log2", "armpl_vlog2q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("log2f", "armpl_vlog2q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("log2", "armpl_svlog2_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("log2f", "armpl_svlog2_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.log2.f64", "armpl_vlog2q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_vlog2q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.log2.f64", "armpl_svlog2_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_svlog2_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("log10", "armpl_vlog10q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("log10f", "armpl_vlog10q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("log10", "armpl_svlog10_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("log10f", "armpl_svlog10_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "armpl_vlog10q_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "armpl_vlog10q_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.log10.f64", "armpl_svlog10_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.log10.f32", "armpl_svlog10_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("nextafter", "armpl_vnextafterq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("nextafterf", "armpl_vnextafterq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("nextafter", "armpl_svnextafter_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("nextafterf", "armpl_svnextafter_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("pow", "armpl_vpowq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("powf", "armpl_vpowq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("pow", "armpl_svpow_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("powf", "armpl_svpow_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "armpl_vpowq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "armpl_vpowq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.pow.f64", "armpl_svpow_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.pow.f32", "armpl_svpow_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("sin", "armpl_vsinq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("sinf", "armpl_vsinq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("sin", "armpl_svsin_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sinf", "armpl_svsin_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "armpl_vsinq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_vsinq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("llvm.sin.f64", "armpl_svsin_f64_x", SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_svsin_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("sinh", "armpl_vsinhq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("sinh", "armpl_svsinh_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sinhf", "armpl_svsinh_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("sinpi", "armpl_vsinpiq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("sinpif", "armpl_vsinpiq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("sinpi", "armpl_svsinpi_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sinpif", "armpl_svsinpi_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("sqrt", "armpl_vsqrtq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("sqrtf", "armpl_vsqrtq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("sqrt", "armpl_svsqrt_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("sqrtf", "armpl_svsqrt_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("tan", "armpl_vtanq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("tanf", "armpl_vtanq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("tan", "armpl_svtan_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("tanf", "armpl_svtan_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("tanh", "armpl_vtanhq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("tanhf", "armpl_vtanhq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("tanh", "armpl_svtanh_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("tanhf", "armpl_svtanh_f32_x", SCALABLE(4), MASKED)
-
-TLI_DEFINE_VECFUNC("tgamma", "armpl_vtgammaq_f64", FIXED(2), NOMASK)
-TLI_DEFINE_VECFUNC("tgammaf", "armpl_vtgammaq_f32", FIXED(4), NOMASK)
-TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x",  SCALABLE(2), MASKED)
-TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED)
+TLI_DEFINE_VECFUNC("acos", "armpl_vacosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("acosf", "armpl_vacosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acos", "armpl_svacos_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("acosf", "armpl_svacos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("acosh", "armpl_vacoshq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("acoshf", "armpl_vacoshq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acosh", "armpl_svacosh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("acoshf", "armpl_svacosh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("asin", "armpl_vasinq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("asinf", "armpl_vasinq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asin", "armpl_svasin_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("asinf", "armpl_svasin_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("asinh", "armpl_vasinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("asinhf", "armpl_vasinhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asinh", "armpl_svasinh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("asinhf", "armpl_svasinh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("atan", "armpl_vatanq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atanf", "armpl_vatanq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atan", "armpl_svatan_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("atanf", "armpl_svatan_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("atan2", "armpl_vatan2q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("atan2f", "armpl_vatan2q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("atan2", "armpl_svatan2_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("atan2f", "armpl_svatan2_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("atanh", "armpl_vatanhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atanhf", "armpl_vatanhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atanh", "armpl_svatanh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("atanhf", "armpl_svatanh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("cbrt", "armpl_vcbrtq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cbrtf", "armpl_vcbrtq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cbrt", "armpl_svcbrt_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("cbrtf", "armpl_svcbrt_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("copysign", "armpl_vcopysignq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("copysignf", "armpl_vcopysignq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("copysign", "armpl_svcopysign_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("copysignf", "armpl_svcopysign_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("cos", "armpl_vcosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cosf", "armpl_vcosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cos", "armpl_svcos_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("cosf", "armpl_svcos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "armpl_vcosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "armpl_vcosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "armpl_svcos_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "armpl_svcos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("cosh", "armpl_vcoshq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("coshf", "armpl_vcoshq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cosh", "armpl_svcosh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("coshf", "armpl_svcosh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("erf", "armpl_verfq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("erff", "armpl_verfq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erf", "armpl_sverf_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("erff", "armpl_sverf_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("erfc", "armpl_verfcq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("erfcf", "armpl_verfcq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erfc", "armpl_sverfc_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("erfcf", "armpl_sverfc_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("exp", "armpl_vexpq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("expf", "armpl_vexpq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp", "armpl_svexp_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("expf", "armpl_svexp_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "armpl_vexpq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "armpl_vexpq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "armpl_svexp_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "armpl_svexp_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("exp2", "armpl_vexp2q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp2f", "armpl_vexp2q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp2", "armpl_svexp2_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("exp2f", "armpl_svexp2_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "armpl_vexp2q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "armpl_vexp2q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "armpl_svexp2_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "armpl_svexp2_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("exp10", "armpl_vexp10q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp10f", "armpl_vexp10q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp10", "armpl_svexp10_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("exp10f", "armpl_svexp10_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("expm1", "armpl_vexpm1q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("expm1f", "armpl_vexpm1q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("expm1", "armpl_svexpm1_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("expm1f", "armpl_svexpm1_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("fdim", "armpl_vfdimq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("fdimf", "armpl_vfdimq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("fdim", "armpl_svfdim_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("fdimf", "armpl_svfdim_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("fma", "armpl_vfmaq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vvv")
+TLI_DEFINE_VECFUNC("fmaf", "armpl_vfmaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vvv")
+TLI_DEFINE_VECFUNC("fma", "armpl_svfma_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvvv")
+TLI_DEFINE_VECFUNC("fmaf", "armpl_svfma_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvvv")
+
+TLI_DEFINE_VECFUNC("fmin", "armpl_vfminq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("fminf", "armpl_vfminq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("fmin", "armpl_svfmin_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("fminf", "armpl_svfmin_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("fmod", "armpl_vfmodq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("fmodf", "armpl_vfmodq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("fmod", "armpl_svfmod_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("fmodf", "armpl_svfmod_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("hypot", "armpl_vhypotq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("hypotf", "armpl_vhypotq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("hypot", "armpl_svhypot_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("hypotf", "armpl_svhypot_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("lgamma", "armpl_vlgammaq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("lgammaf", "armpl_vlgammaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("lgamma", "armpl_svlgamma_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("lgammaf", "armpl_svlgamma_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("log", "armpl_vlogq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("logf", "armpl_vlogq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log", "armpl_svlog_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("logf", "armpl_svlog_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.log.f64", "armpl_vlogq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "armpl_vlogq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "armpl_svlog_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "armpl_svlog_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("log1p", "armpl_vlog1pq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log1pf", "armpl_vlog1pq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log1p", "armpl_svlog1p_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("log1pf", "armpl_svlog1p_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("log2", "armpl_vlog2q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log2f", "armpl_vlog2q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log2", "armpl_svlog2_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("log2f", "armpl_svlog2_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "armpl_vlog2q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_vlog2q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "armpl_svlog2_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_svlog2_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("log10", "armpl_vlog10q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log10f", "armpl_vlog10q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log10", "armpl_svlog10_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("log10f", "armpl_svlog10_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "armpl_vlog10q_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "armpl_vlog10q_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "armpl_svlog10_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "armpl_svlog10_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("nextafter", "armpl_vnextafterq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("nextafterf", "armpl_vnextafterq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("nextafter", "armpl_svnextafter_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("nextafterf", "armpl_svnextafter_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("pow", "armpl_vpowq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("powf", "armpl_vpowq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("pow", "armpl_svpow_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("powf", "armpl_svpow_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "armpl_vpowq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "armpl_vpowq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "armpl_svpow_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "armpl_svpow_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
+TLI_DEFINE_VECFUNC("sin", "armpl_vsinq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinf", "armpl_vsinq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sin", "armpl_svsin_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sinf", "armpl_svsin_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "armpl_vsinq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_vsinq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "armpl_svsin_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_svsin_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("sinh", "armpl_vsinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sinh", "armpl_svsinh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sinhf", "armpl_svsinh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("sinpi", "armpl_vsinpiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sinpif", "armpl_vsinpiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sinpi", "armpl_svsinpi_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sinpif", "armpl_svsinpi_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("sqrt", "armpl_vsqrtq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("sqrtf", "armpl_vsqrtq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sqrt", "armpl_svsqrt_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("sqrtf", "armpl_svsqrt_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("tan", "armpl_vtanq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tanf", "armpl_vtanq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tan", "armpl_svtan_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("tanf", "armpl_svtan_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("tanh", "armpl_vtanhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tanhf", "armpl_vtanhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanh", "armpl_svtanh_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("tanhf", "armpl_svtanh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("tgamma", "armpl_vtgammaq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tgammaf", "armpl_vtgammaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
 #else
 #error "Must choose which vector library functions are to be defined."
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 98ae6e4a02158d..7947648aaddd4e 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -182,27 +182,6 @@ static constexpr char const *_LLVM_Scalarize_ = "_LLVM_Scalarize_";
 std::optional<VFInfo> tryDemangleForVFABI(StringRef MangledName,
                                           const Module &M);
 
-/// This routine mangles the given VectorName according to the LangRef
-/// specification for vector-function-abi-variant attribute and is specific to
-/// the TLI mappings. It is the responsibility of the caller to make sure that
-/// this is only used if all parameters in the vector function are vector type.
-/// This returned string holds scalar-to-vector mapping:
-///    _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
-///
-/// where:
-///
-/// <isa> = "_LLVM_"
-/// <mask> = "M" if masked, "N" if no mask.
-/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
-///          field of the `VecDesc` struct. If the number of lanes is scalable
-///          then 'x' is printed instead.
-/// <vparams> = "v", as many as are the numArgs.
-/// <scalarname> = the name of the scalar function.
-/// <vectorname> = the name of the vector function.
-std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
-                                unsigned numArgs, ElementCount VF,
-                                bool Masked = false);
-
 /// Retrieve the `VFParamKind` from a string token.
 VFParamKind getVFParamKindFromString(const StringRef Token);
 
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index f92071e32222e1..c3dcd568216b71 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -426,6 +426,74 @@ struct ResourceBindInfo : public v0::ResourceBindInfo {
 } // namespace v2
 } // namespace PSV
 
+#define COMPONENT_PRECISION(Val, Enum) Enum = Val,
+enum class SigMinPrecision : uint32_t {
+#include "DXContainerConstants.def"
+};
+
+ArrayRef<EnumEntry<SigMinPrecision>> getSigMinPrecisions();
+
+#define D3D_SYSTEM_VALUE(Val, Enum) Enum = Val,
+enum class D3DSystemValue : uint32_t {
+#include "DXContainerConstants.def"
+};
+
+ArrayRef<EnumEntry<D3DSystemValue>> getD3DSystemValues();
+
+#define COMPONENT_TYPE(Val, Enum) Enum = Val,
+enum class SigComponentType : uint32_t {
+#include "DXContainerConstants.def"
+};
+
+ArrayRef<EnumEntry<SigComponentType>> getSigComponentTypes();
+
+struct ProgramSignatureHeader {
+  uint32_t ParamCount;
+  uint32_t FirstParamOffset;
+
+  void swapBytes() {
+    sys::swapByteOrder(ParamCount);
+    sys::swapByteOrder(FirstParamOffset);
+  }
+};
+
+struct ProgramSignatureElement {
+  uint32_t Stream;     // Stream index (parameters must appear in non-decreasing
+                       // stream order)
+  uint32_t NameOffset; // Offset from the start of the ProgramSignatureHeader to
+                       // the start of the null terminated string for the name.
+  uint32_t Index;      // Semantic Index
+  D3DSystemValue SystemValue; // Semantic type. Similar to PSV::SemanticKind.
+  SigComponentType CompType;  // Type of bits.
+  uint32_t Register;          // Register Index (row index)
+  uint8_t Mask;               // Mask (column allocation)
+
+  // The ExclusiveMask has a different meaning for input and output signatures.
+  // For an output signature, masked components of the output register are never
+  // written to.
+  // For an input signature, masked components of the input register are always
+  // read.
+  uint8_t ExclusiveMask;
+
+  uint16_t Unused;
+  SigMinPrecision MinPrecision; // Minimum precision of input/output data
+
+  void swapBytes() {
+    sys::swapByteOrder(Stream);
+    sys::swapByteOrder(NameOffset);
+    sys::swapByteOrder(Index);
+    sys::swapByteOrder(SystemValue);
+    sys::swapByteOrder(CompType);
+    sys::swapByteOrder(Register);
+    sys::swapByteOrder(Mask);
+    sys::swapByteOrder(ExclusiveMask);
+    sys::swapByteOrder(MinPrecision);
+  }
+};
+
+static_assert(sizeof(ProgramSignatureElement) == 32,
+              "ProgramSignatureElement is misaligned");
+
 } // namespace dxbc
 } // namespace llvm
 
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index e2cbad293e1625..87dd0a5cb6ba70 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -4,6 +4,9 @@ CONTAINER_PART(DXIL)
 CONTAINER_PART(SFI0)
 CONTAINER_PART(HASH)
 CONTAINER_PART(PSV0)
+CONTAINER_PART(ISG1)
+CONTAINER_PART(OSG1)
+CONTAINER_PART(PSG1)
 
 #undef CONTAINER_PART
 #endif 
@@ -101,6 +104,20 @@ COMPONENT_TYPE(9, Float64)
 #undef COMPONENT_TYPE
 #endif
 
+#ifdef COMPONENT_PRECISION
+
+COMPONENT_PRECISION(0, Default)
+COMPONENT_PRECISION(1, Float16)
+COMPONENT_PRECISION(2, Float2_8)
+COMPONENT_PRECISION(3, Reserved)
+COMPONENT_PRECISION(4, SInt16)
+COMPONENT_PRECISION(5, UInt16)
+COMPONENT_PRECISION(0xf0, Any16)
+COMPONENT_PRECISION(0xf1, Any10)
+
+#undef COMPONENT_PRECISION
+#endif
+
 #ifdef INTERPOLATION_MODE
 
 INTERPOLATION_MODE(0, Undefined)
@@ -115,3 +132,37 @@ INTERPOLATION_MODE(8, Invalid)
 
 #undef INTERPOLATION_MODE
 #endif
+
+#ifdef D3D_SYSTEM_VALUE
+
+D3D_SYSTEM_VALUE(0, Undefined)
+D3D_SYSTEM_VALUE(1, Position)
+D3D_SYSTEM_VALUE(2, ClipDistance)
+D3D_SYSTEM_VALUE(3, CullDistance)
+D3D_SYSTEM_VALUE(4, RenderTargetArrayIndex)
+D3D_SYSTEM_VALUE(5, ViewPortArrayIndex)
+D3D_SYSTEM_VALUE(6, VertexID)
+D3D_SYSTEM_VALUE(7, PrimitiveID)
+D3D_SYSTEM_VALUE(8, InstanceID)
+D3D_SYSTEM_VALUE(9, IsFrontFace)
+D3D_SYSTEM_VALUE(10, SampleIndex)
+D3D_SYSTEM_VALUE(11, FinalQuadEdgeTessfactor)
+D3D_SYSTEM_VALUE(12, FinalQuadInsideTessfactor)
+D3D_SYSTEM_VALUE(13, FinalTriEdgeTessfactor)
+D3D_SYSTEM_VALUE(14, FinalTriInsideTessfactor)
+D3D_SYSTEM_VALUE(15, FinalLineDetailTessfactor)
+D3D_SYSTEM_VALUE(16, FinalLineDensityTessfactor)
+D3D_SYSTEM_VALUE(23, Barycentrics)
+D3D_SYSTEM_VALUE(24, ShadingRate)
+D3D_SYSTEM_VALUE(25, CullPrimitive)
+D3D_SYSTEM_VALUE(64, Target)
+D3D_SYSTEM_VALUE(65, Depth)
+D3D_SYSTEM_VALUE(66, Coverage)
+D3D_SYSTEM_VALUE(67, DepthGE)
+D3D_SYSTEM_VALUE(68, DepthLE)
+D3D_SYSTEM_VALUE(69, StencilRef)
+D3D_SYSTEM_VALUE(70, InnerCoverage)
+
+#undef D3D_SYSTEM_VALUE
+
+#endif
diff --git a/llvm/include/llvm/BinaryFormat/Magic.h b/llvm/include/llvm/BinaryFormat/Magic.h
index 329c96f5c14c40..a28710dcdfaf2c 100644
--- a/llvm/include/llvm/BinaryFormat/Magic.h
+++ b/llvm/include/llvm/BinaryFormat/Magic.h
@@ -42,19 +42,21 @@ struct file_magic {
     macho_universal_binary,                   ///< Mach-O universal binary
     macho_file_set,                           ///< Mach-O file set binary
     minidump,                                 ///< Windows minidump file
-    coff_cl_gl_object,   ///< Microsoft cl.exe's intermediate code file
-    coff_object,         ///< COFF object file
-    coff_import_library, ///< COFF import library
-    pecoff_executable,   ///< PECOFF executable file
-    windows_resource,    ///< Windows compiled resource file (.res)
-    xcoff_object_32,     ///< 32-bit XCOFF object file
-    xcoff_object_64,     ///< 64-bit XCOFF object file
-    wasm_object,         ///< WebAssembly Object file
-    pdb,                 ///< Windows PDB debug info file
-    tapi_file,           ///< Text-based Dynamic Library Stub file
-    cuda_fatbinary,      ///< CUDA Fatbinary object file
-    offload_binary,      ///< LLVM offload object file
-    dxcontainer_object,  ///< DirectX container file
+    coff_cl_gl_object,         ///< Microsoft cl.exe's intermediate code file
+    coff_object,               ///< COFF object file
+    coff_import_library,       ///< COFF import library
+    pecoff_executable,         ///< PECOFF executable file
+    windows_resource,          ///< Windows compiled resource file (.res)
+    xcoff_object_32,           ///< 32-bit XCOFF object file
+    xcoff_object_64,           ///< 64-bit XCOFF object file
+    wasm_object,               ///< WebAssembly Object file
+    pdb,                       ///< Windows PDB debug info file
+    tapi_file,                 ///< Text-based Dynamic Library Stub file
+    cuda_fatbinary,            ///< CUDA Fatbinary object file
+    offload_binary,            ///< LLVM offload object file
+    dxcontainer_object,        ///< DirectX container file
+    offload_bundle,            ///< Clang offload bundle file
+    offload_bundle_compressed, ///< Compressed clang offload bundle file
   };
 
   bool is_object() const { return V != unknown; }
diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h
index 749d43b840d5c2..c7658cc7b7432b 100644
--- a/llvm/include/llvm/BinaryFormat/Wasm.h
+++ b/llvm/include/llvm/BinaryFormat/Wasm.h
@@ -412,6 +412,7 @@ const unsigned WASM_SYMBOL_EXPORTED = 0x20;
 const unsigned WASM_SYMBOL_EXPLICIT_NAME = 0x40;
 const unsigned WASM_SYMBOL_NO_STRIP = 0x80;
 const unsigned WASM_SYMBOL_TLS = 0x100;
+const unsigned WASM_SYMBOL_ABSOLUTE = 0x200;
 
 #define WASM_RELOC(name, value) name = value,
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c11d558a73e9d0..5d83a0d37dc3a1 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -935,31 +935,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                               ArrayRef<int> Mask,
                                               VectorType *Ty, int &Index,
                                               VectorType *&SubTy) const {
-    int Limit = Mask.size() * 2;
-    if (Mask.empty() ||
-        // Extra check required by isSingleSourceMaskImpl function (called by
-        // ShuffleVectorInst::isSingleSourceMask).
-        any_of(Mask, [Limit](int I) { return I >= Limit; }))
+    if (Mask.empty())
       return Kind;
+    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
     switch (Kind) {
     case TTI::SK_PermuteSingleSrc:
-      if (ShuffleVectorInst::isReverseMask(Mask))
+      if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
         return TTI::SK_Reverse;
-      if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
+      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
         return TTI::SK_Broadcast;
       break;
     case TTI::SK_PermuteTwoSrc: {
       int NumSubElts;
       if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
-                                 Mask, Mask.size(), NumSubElts, Index)) {
+                                 Mask, NumSrcElts, NumSubElts, Index)) {
+        if (Index + NumSubElts > NumSrcElts)
+          return Kind;
         SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
         return TTI::SK_InsertSubvector;
       }
-      if (ShuffleVectorInst::isSelectMask(Mask))
+      if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
         return TTI::SK_Select;
-      if (ShuffleVectorInst::isTransposeMask(Mask))
+      if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
         return TTI::SK_Transpose;
-      if (ShuffleVectorInst::isSpliceMask(Mask, Index))
+      if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
         return TTI::SK_Splice;
       break;
     }
@@ -1687,6 +1686,62 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
     }
 
+    // VP Intrinsics should have the same cost as their non-vp counterpart.
+    // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
+    // counterpart when the vector length argument is smaller than the maximum
+    // vector length.
+    if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
+      std::optional<unsigned> FOp =
+          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
+      if (FOp) {
+        // TODO: Support other kinds of Intrinsics (i.e. reductions)
+        if (ICA.getID() == Intrinsic::vp_load) {
+          Align Alignment;
+          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+            Alignment = VPI->getPointerAlignment().valueOrOne();
+          unsigned AS = 0;
+          if (ICA.getArgs().size() > 1)
+            if (auto *PtrTy =
+                    dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
+              AS = PtrTy->getAddressSpace();
+          return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
+                                          AS, CostKind);
+        }
+        if (ICA.getID() == Intrinsic::vp_store) {
+          Align Alignment;
+          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+            Alignment = VPI->getPointerAlignment().valueOrOne();
+          unsigned AS = 0;
+          if (ICA.getArgs().size() >= 2)
+            if (auto *PtrTy =
+                    dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
+              AS = PtrTy->getAddressSpace();
+          return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
+                                          AS, CostKind);
+        }
+        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
+          return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
+                                                 CostKind);
+        }
+      }
+
+      std::optional<Intrinsic::ID> FID =
+          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
+      if (FID) {
+        // Non-vp version will have same Args/Tys except mask and vector length.
+        assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
+               "Expected VPIntrinsic to have Mask and Vector Length args and "
+               "types");
+        ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs()).drop_back(2);
+        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
+
+        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
+                                       NewTys, ICA.getFlags(), ICA.getInst(),
+                                       ICA.getScalarizationCost());
+        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
+      }
+    }
+
     // Assume that we need to scalarize this intrinsic.
     // Compute the scalarization overhead based on Args for a vector
     // intrinsic.
diff --git a/llvm/include/llvm/CodeGen/ByteProvider.h b/llvm/include/llvm/CodeGen/ByteProvider.h
index 3187b4e68c56f3..adfa59f14b1dc9 100644
--- a/llvm/include/llvm/CodeGen/ByteProvider.h
+++ b/llvm/include/llvm/CodeGen/ByteProvider.h
@@ -32,6 +32,11 @@ template <typename ISelOp> class ByteProvider {
   ByteProvider(std::optional<ISelOp> Src, int64_t DestOffset, int64_t SrcOffset)
       : Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset) {}
 
+  ByteProvider(std::optional<ISelOp> Src, int64_t DestOffset, int64_t SrcOffset,
+               std::optional<bool> IsSigned)
+      : Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset),
+        IsSigned(IsSigned) {}
+
   // TODO -- use constraint in c++20
   // Does this type correspond with an operation in selection DAG
   template <typename T> class is_op {
@@ -61,6 +66,9 @@ template <typename ISelOp> class ByteProvider {
   // DestOffset
   int64_t SrcOffset = 0;
 
+  // Whether or not the path to this Src involved signed extensions
+  std::optional<bool> IsSigned;
+
   ByteProvider() = default;
 
   static ByteProvider getSrc(std::optional<ISelOp> Val, int64_t ByteOffset,
@@ -70,6 +78,14 @@ template <typename ISelOp> class ByteProvider {
     return ByteProvider(Val, ByteOffset, VectorOffset);
   }
 
+  static ByteProvider getSrc(std::optional<ISelOp> Val, int64_t ByteOffset,
+                             int64_t VectorOffset,
+                             std::optional<bool> IsSigned) {
+    static_assert(is_op<ISelOp>().value,
+                  "ByteProviders must contain an operation in selection DAG.");
+    return ByteProvider(Val, ByteOffset, VectorOffset, IsSigned);
+  }
+
   static ByteProvider getConstantZero() {
     return ByteProvider<ISelOp>(std::nullopt, 0, 0);
   }
diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h
index 732d6d0211c996..918e69352effe7 100644
--- a/llvm/include/llvm/CodeGen/CommandFlags.h
+++ b/llvm/include/llvm/CodeGen/CommandFlags.h
@@ -28,6 +28,7 @@ class Module;
 class AttrBuilder;
 class Function;
 class Triple;
+class TargetMachine;
 
 namespace codegen {
 
@@ -186,6 +187,14 @@ void setFunctionAttributes(StringRef CPU, StringRef Features, Module &M);
 /// Should value-tracking variable locations / instruction referencing be
 /// enabled by default for this triple?
 bool getDefaultValueTrackingVariableLocations(const llvm::Triple &T);
+
+/// Creates a TargetMachine instance with the options defined on the command
+/// line. This can be used for tools that do not need further customization of
+/// the TargetOptions.
+Expected<std::unique_ptr<TargetMachine>> createTargetMachineForTriple(
+    StringRef TargetTriple,
+    CodeGenOptLevel OptLevel = CodeGenOptLevel::Default);
+
 } // namespace codegen
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h b/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
index 3f85d22be53399..e4d9ff522b5a91 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
@@ -440,7 +440,7 @@ class RegBankSelect : public MachineFunctionPass {
   public:
     /// Create a MappingCost assuming that most of the instructions
     /// will occur in a basic block with \p LocalFreq frequency.
-    MappingCost(const BlockFrequency &LocalFreq);
+    MappingCost(BlockFrequency LocalFreq);
 
     /// Add \p Cost to the local cost.
     /// \return true if this cost is saturated, false otherwise.
diff --git a/llvm/include/llvm/CodeGen/MBFIWrapper.h b/llvm/include/llvm/CodeGen/MBFIWrapper.h
index 714ecc5d4334e4..60917b4e923fca 100644
--- a/llvm/include/llvm/CodeGen/MBFIWrapper.h
+++ b/llvm/include/llvm/CodeGen/MBFIWrapper.h
@@ -16,7 +16,6 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/raw_ostream.h"
 #include <optional>
 
 namespace llvm {
@@ -33,15 +32,11 @@ class MBFIWrapper {
   std::optional<uint64_t>
   getBlockProfileCount(const MachineBasicBlock *MBB) const;
 
-  raw_ostream &printBlockFreq(raw_ostream &OS,
-                              const MachineBasicBlock *MBB) const;
-  raw_ostream &printBlockFreq(raw_ostream &OS,
-                              const BlockFrequency Freq) const;
   void view(const Twine &Name, bool isSimple = true);
-  uint64_t getEntryFreq() const;
-  const MachineBlockFrequencyInfo &getMBFI() { return MBFI; }
+  BlockFrequency getEntryFreq() const;
+  const MachineBlockFrequencyInfo &getMBFI() const { return MBFI; }
 
- private:
+private:
   const MachineBlockFrequencyInfo &MBFI;
   DenseMap<const MachineBasicBlock *, BlockFrequency> MergedBBFreq;
 };
diff --git a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
index 1152eefed6e45c..3a85bb4ac9f7dd 100644
--- a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
@@ -66,14 +66,15 @@ class MachineBlockFrequencyInfo : public MachineFunctionPass {
   /// Compute the frequency of the block, relative to the entry block.
   /// This API assumes getEntryFreq() is non-zero.
   double getBlockFreqRelativeToEntryBlock(const MachineBasicBlock *MBB) const {
-    assert(getEntryFreq() != 0 && "getEntryFreq() should not return 0 here!");
+    assert(getEntryFreq() != BlockFrequency(0) &&
+           "getEntryFreq() should not return 0 here!");
     return static_cast<double>(getBlockFreq(MBB).getFrequency()) /
-           static_cast<double>(getEntryFreq());
+           static_cast<double>(getEntryFreq().getFrequency());
   }
 
   std::optional<uint64_t>
   getBlockProfileCount(const MachineBasicBlock *MBB) const;
-  std::optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
+  std::optional<uint64_t> getProfileCountFromFreq(BlockFrequency Freq) const;
 
   bool isIrrLoopHeader(const MachineBasicBlock *MBB) const;
 
@@ -90,20 +91,22 @@ class MachineBlockFrequencyInfo : public MachineFunctionPass {
   /// rendered using dot.
   void view(const Twine &Name, bool isSimple = true) const;
 
-  // Print the block frequency Freq to OS using the current functions entry
-  // frequency to convert freq into a relative decimal form.
-  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const;
-
-  // Convenience method that attempts to look up the frequency associated with
-  // BB and print it to OS.
-  raw_ostream &printBlockFreq(raw_ostream &OS,
-                              const MachineBasicBlock *MBB) const;
-
   /// Divide a block's BlockFrequency::getFrequency() value by this value to
   /// obtain the entry block - relative frequency of said block.
-  uint64_t getEntryFreq() const;
+  BlockFrequency getEntryFreq() const;
 };
 
+/// Print the block frequency @p Freq relative to the current functions entry
+/// frequency. Returns a Printable object that can be piped via `<<` to a
+/// `raw_ostream`.
+Printable printBlockFreq(const MachineBlockFrequencyInfo &MBFI,
+                         BlockFrequency Freq);
+
+/// Convenience function equivalent to calling
+/// `printBlockFreq(MBFI, MBFI.getBlockFreq(&MBB))`.
+Printable printBlockFreq(const MachineBlockFrequencyInfo &MBFI,
+                         const MachineBasicBlock &MBB);
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_MACHINEBLOCKFREQUENCYINFO_H
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 6c2da626ea54b4..8f1651c2958e59 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -266,7 +266,7 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   // RegInfo - Information about each register in use in the function.
   MachineRegisterInfo *RegInfo;
 
-  // Used to keep track of target-specific per-machine function information for
+  // Used to keep track of target-specific per-machine-function information for
   // the target implementation.
   MachineFunctionInfo *MFInfo;
 
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 03fb15f77c65cb..8367f999fcc76d 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1276,7 +1276,7 @@ class MachineInstr
   /// eraseFromBundle() to erase individual bundled instructions.
   void eraseFromParent();
 
-  /// Unlink 'this' form its basic block and delete it.
+  /// Unlink 'this' from its basic block and delete it.
   ///
   /// If the instruction is part of a bundle, the other instructions in the
   /// bundle remain bundled.
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
index b8238db5325daf..8cbd9ed2448f54 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
@@ -86,7 +86,7 @@ template <typename MachOTraits> class MachOBuilder {
   using StringTable = std::vector<StringTableEntry>;
 
   static bool swapStruct() {
-    return MachOTraits::Endianness != support::endian::system_endianness();
+    return MachOTraits::Endianness != llvm::endianness::native;
   }
 
 public:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index a9bfa2fe829363..f8b3b0c7524979 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -450,6 +450,10 @@ def OMPC_OMPX_Attribute : Clause<"ompx_attribute"> {
   let clangClass = "OMPXAttributeClause";
 }
 
+def OMPC_OMX_Bare : Clause<"ompx_bare"> {
+  let clangClass = "OMPXBareClause";
+}
+
 //===----------------------------------------------------------------------===//
 // Definition of OpenMP directives
 //===----------------------------------------------------------------------===//
@@ -1520,6 +1524,7 @@ def OMP_TargetTeams : Directive<"target teams"> {
     VersionedClause<OMPC_NumTeams>,
     VersionedClause<OMPC_ThreadLimit>,
     VersionedClause<OMPC_OMPX_DynCGroupMem>,
+    VersionedClause<OMPC_OMX_Bare>,
   ];
 }
 def OMP_TargetTeamsDistribute : Directive<"target teams distribute"> {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 1699ed3aeab766..75da461cfd8d95 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -877,6 +877,30 @@ class OpenMPIRBuilder {
       std::function<GlobalValue::LinkageTypes()> VariableLinkage,
       Type *LlvmPtrTy, Constant *Addr);
 
+  /// Get the offset of the OMP_MAP_MEMBER_OF field.
+  unsigned getFlagMemberOffset();
+
+  /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
+  /// the position given.
+  /// \param Position - A value indicating the position of the parent
+  /// of the member in the kernel argument structure, often retrieved
+  /// by the parents position in the combined information vectors used
+  /// to generate the structure itself. Multiple children (member's of)
+  /// with the same parent will use the same returned member flag.
+  omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
+
+  /// Given an initial flag set, this function modifies it to contain
+  /// the passed in MemberOfFlag generated from the getMemberOfFlag
+  /// function. The results are dependent on the existing flag bits
+  /// set in the original flag set.
+  /// \param Flags - The original set of flags to be modified with the
+  /// passed in MemberOfFlag.
+  /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
+  /// slightly based on the getMemberOfFlag which adjusts the flag bits
+  /// based on the members position in its parent.
+  void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
+                              omp::OpenMPOffloadMappingFlags MemberOfFlag);
+
 private:
   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
   ///
diff --git a/llvm/include/llvm/IR/ConstantFolder.h b/llvm/include/llvm/IR/ConstantFolder.h
index 30c8b598bda6f9..c2b30a65e32e25 100644
--- a/llvm/include/llvm/IR/ConstantFolder.h
+++ b/llvm/include/llvm/IR/ConstantFolder.h
@@ -175,8 +175,11 @@ class ConstantFolder final : public IRBuilderFolder {
 
   Value *FoldCast(Instruction::CastOps Op, Value *V,
                   Type *DestTy) const override {
-    if (auto *C = dyn_cast<Constant>(V))
-      return ConstantExpr::getCast(Op, C, DestTy);
+    if (auto *C = dyn_cast<Constant>(V)) {
+      if (ConstantExpr::isDesirableCastOp(Op))
+        return ConstantExpr::getCast(Op, C, DestTy);
+      return ConstantFoldCastInstruction(Op, C, DestTy);
+    }
     return nullptr;
   }
 
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index 710a9142d5f720..afa1aefb44aa1a 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -1330,6 +1330,9 @@ class ConstantExpr : public Constant {
   /// supported.
   static bool isSupportedBinOp(unsigned Opcode);
 
+  /// Whether creating a constant expression for this cast is desirable.
+  static bool isDesirableCastOp(unsigned Opcode);
+
   /// Whether creating a constant expression for this getelementptr type is
   /// supported.
   static bool isSupportedGetElementPtr(const Type *SrcElemTy) {
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index a0c8406fe4ec1e..c85727ce30a94a 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -2068,13 +2068,14 @@ class ShuffleVectorInst : public Instruction {
   /// Return true if this shuffle mask chooses elements from exactly one source
   /// vector.
   /// Example: <7,5,undef,7>
-  /// This assumes that vector operands are the same length as the mask.
-  static bool isSingleSourceMask(ArrayRef<int> Mask);
-  static bool isSingleSourceMask(const Constant *Mask) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isSingleSourceMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isSingleSourceMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isSingleSourceMask(MaskAsInts);
+    return isSingleSourceMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle chooses elements from exactly one source
@@ -2082,7 +2083,8 @@ class ShuffleVectorInst : public Instruction {
   /// Example: shufflevector <4 x n> A, <4 x n> B, <3,0,undef,3>
   /// TODO: Optionally allow length-changing shuffles.
   bool isSingleSource() const {
-    return !changesLength() && isSingleSourceMask(ShuffleMask);
+    return !changesLength() &&
+           isSingleSourceMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask chooses elements from exactly one source
@@ -2090,8 +2092,8 @@ class ShuffleVectorInst : public Instruction {
   /// necessarily a no-op because it may change the number of elements from its
   /// input vectors or it may provide demanded bits knowledge via undef lanes.
   /// Example: <undef,undef,2,3>
-  static bool isIdentityMask(ArrayRef<int> Mask);
-  static bool isIdentityMask(const Constant *Mask) {
+  static bool isIdentityMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isIdentityMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
 
     // Not possible to express a shuffle mask for a scalable vector for this
@@ -2101,7 +2103,7 @@ class ShuffleVectorInst : public Instruction {
 
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isIdentityMask(MaskAsInts);
+    return isIdentityMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle chooses elements from exactly one source
@@ -2114,7 +2116,7 @@ class ShuffleVectorInst : public Instruction {
     if (isa<ScalableVectorType>(getType()))
       return false;
 
-    return !changesLength() && isIdentityMask(ShuffleMask);
+    return !changesLength() && isIdentityMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle lengthens exactly one source vector with
@@ -2138,12 +2140,12 @@ class ShuffleVectorInst : public Instruction {
   /// In that case, the shuffle is better classified as an identity shuffle.
   /// This assumes that vector operands are the same length as the mask
   /// (a length-changing shuffle can never be equivalent to a vector select).
-  static bool isSelectMask(ArrayRef<int> Mask);
-  static bool isSelectMask(const Constant *Mask) {
+  static bool isSelectMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isSelectMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isSelectMask(MaskAsInts);
+    return isSelectMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle chooses elements from its source vectors
@@ -2155,19 +2157,20 @@ class ShuffleVectorInst : public Instruction {
   /// In that case, the shuffle is better classified as an identity shuffle.
   /// TODO: Optionally allow length-changing shuffles.
   bool isSelect() const {
-    return !changesLength() && isSelectMask(ShuffleMask);
+    return !changesLength() && isSelectMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask swaps the order of elements from exactly
   /// one source vector.
   /// Example: <7,6,undef,4>
-  /// This assumes that vector operands are the same length as the mask.
-  static bool isReverseMask(ArrayRef<int> Mask);
-  static bool isReverseMask(const Constant *Mask) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isReverseMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isReverseMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isReverseMask(MaskAsInts);
+    return isReverseMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle swaps the order of elements from exactly
@@ -2175,19 +2178,20 @@ class ShuffleVectorInst : public Instruction {
   /// Example: shufflevector <4 x n> A, <4 x n> B, <3,undef,1,undef>
   /// TODO: Optionally allow length-changing shuffles.
   bool isReverse() const {
-    return !changesLength() && isReverseMask(ShuffleMask);
+    return !changesLength() && isReverseMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask chooses all elements with the same value
   /// as the first element of exactly one source vector.
   /// Example: <4,undef,undef,4>
-  /// This assumes that vector operands are the same length as the mask.
-  static bool isZeroEltSplatMask(ArrayRef<int> Mask);
-  static bool isZeroEltSplatMask(const Constant *Mask) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isZeroEltSplatMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isZeroEltSplatMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isZeroEltSplatMask(MaskAsInts);
+    return isZeroEltSplatMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if all elements of this shuffle are the same value as the
@@ -2197,7 +2201,8 @@ class ShuffleVectorInst : public Instruction {
   /// TODO: Optionally allow length-changing shuffles.
   /// TODO: Optionally allow splats from other elements.
   bool isZeroEltSplat() const {
-    return !changesLength() && isZeroEltSplatMask(ShuffleMask);
+    return !changesLength() &&
+           isZeroEltSplatMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask is a transpose mask.
@@ -2232,12 +2237,12 @@ class ShuffleVectorInst : public Instruction {
   ///   ; Transposed matrix
   ///   t0 = < a, e, c, g > = shufflevector m0, m1 < 0, 4, 2, 6 >
   ///   t1 = < b, f, d, h > = shufflevector m0, m1 < 1, 5, 3, 7 >
-  static bool isTransposeMask(ArrayRef<int> Mask);
-  static bool isTransposeMask(const Constant *Mask) {
+  static bool isTransposeMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isTransposeMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isTransposeMask(MaskAsInts);
+    return isTransposeMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle transposes the elements of its inputs without
@@ -2246,19 +2251,21 @@ class ShuffleVectorInst : public Instruction {
   /// exact specification.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <0,4,2,6>
   bool isTranspose() const {
-    return !changesLength() && isTransposeMask(ShuffleMask);
+    return !changesLength() && isTransposeMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask is a splice mask, concatenating the two
   /// inputs together and then extracts an original width vector starting from
   /// the splice index.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4>
-  static bool isSpliceMask(ArrayRef<int> Mask, int &Index);
-  static bool isSpliceMask(const Constant *Mask, int &Index) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isSpliceMask(ArrayRef<int> Mask, int NumSrcElts, int &Index);
+  static bool isSpliceMask(const Constant *Mask, int NumSrcElts, int &Index) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isSpliceMask(MaskAsInts, Index);
+    return isSpliceMask(MaskAsInts, NumSrcElts, Index);
   }
 
   /// Return true if this shuffle splices two inputs without changing the length
@@ -2266,7 +2273,8 @@ class ShuffleVectorInst : public Instruction {
   /// then extracts an original width vector starting from the splice index.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4>
   bool isSplice(int &Index) const {
-    return !changesLength() && isSpliceMask(ShuffleMask, Index);
+    return !changesLength() &&
+           isSpliceMask(ShuffleMask, ShuffleMask.size(), Index);
   }
 
   /// Return true if this shuffle mask is an extract subvector mask.
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index c12e899d58fa83..c38078cc6087ef 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -166,8 +166,8 @@ class Type {
   bool isPPC_FP128Ty() const { return getTypeID() == PPC_FP128TyID; }
 
   /// Return true if this is a well-behaved IEEE-like type, which has a IEEE
-  /// compatible layout as defined by isIEEE(), and does not have unnormal
-  /// values
+  /// compatible layout as defined by APFloat::isIEEE(), and does not have
+  /// non-IEEE values, such as x86_fp80's unnormal values.
   bool isIEEELikeFPTy() const {
     switch (getTypeID()) {
     case DoubleTyID:
diff --git a/llvm/include/llvm/MC/DXContainerPSVInfo.h b/llvm/include/llvm/MC/DXContainerPSVInfo.h
index 9b1892088142b6..7d21c18d252f1c 100644
--- a/llvm/include/llvm/MC/DXContainerPSVInfo.h
+++ b/llvm/include/llvm/MC/DXContainerPSVInfo.h
@@ -86,6 +86,34 @@ struct PSVRuntimeInfo {
   }
 };
 
+class Signature {
+  struct Parameter {
+    uint32_t Stream;
+    StringRef Name;
+    uint32_t Index;
+    dxbc::D3DSystemValue SystemValue;
+    dxbc::SigComponentType CompType;
+    uint32_t Register;
+    uint8_t Mask;
+    uint8_t ExclusiveMask;
+    dxbc::SigMinPrecision MinPrecision;
+  };
+
+  SmallVector<Parameter> Params;
+
+public:
+  void addParam(uint32_t Stream, StringRef Name, uint32_t Index,
+                dxbc::D3DSystemValue SystemValue,
+                dxbc::SigComponentType CompType, uint32_t Register,
+                uint8_t Mask, uint8_t ExclusiveMask,
+                dxbc::SigMinPrecision MinPrecision) {
+    Params.push_back(Parameter{Stream, Name, Index, SystemValue, CompType,
+                               Register, Mask, ExclusiveMask, MinPrecision});
+  }
+
+  void write(raw_ostream &OS);
+};
+
 } // namespace mcdxbc
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h
index 8f7ab59d7f3828..a7f18c79969803 100644
--- a/llvm/include/llvm/Object/DXContainer.h
+++ b/llvm/include/llvm/Object/DXContainer.h
@@ -27,8 +27,6 @@
 namespace llvm {
 namespace object {
 
-namespace DirectX {
-
 namespace detail {
 template <typename T>
 std::enable_if_t<std::is_arithmetic<T>::value, void> swapBytes(T &value) {
@@ -40,82 +38,85 @@ std::enable_if_t<std::is_class<T>::value, void> swapBytes(T &value) {
   value.swapBytes();
 }
 } // namespace detail
-class PSVRuntimeInfo {
 
-  // This class provides a view into the underlying resource array. The Resource
-  // data is little-endian encoded and may not be properly aligned to read
-  // directly from. The dereference operator creates a copy of the data and byte
-  // swaps it as appropriate.
-  template <typename T> struct ViewArray {
-    StringRef Data;
-    uint32_t Stride = sizeof(T); // size of each element in the list.
+// This class provides a view into the underlying resource array. The Resource
+// data is little-endian encoded and may not be properly aligned to read
+// directly from. The dereference operator creates a copy of the data and byte
+// swaps it as appropriate.
+template <typename T> struct ViewArray {
+  StringRef Data;
+  uint32_t Stride = sizeof(T); // size of each element in the list.
 
-    ViewArray() = default;
-    ViewArray(StringRef D, size_t S) : Data(D), Stride(S) {}
+  ViewArray() = default;
+  ViewArray(StringRef D, size_t S) : Data(D), Stride(S) {}
 
-    using value_type = T;
-    static constexpr uint32_t MaxStride() {
-      return static_cast<uint32_t>(sizeof(value_type));
-    }
+  using value_type = T;
+  static constexpr uint32_t MaxStride() {
+    return static_cast<uint32_t>(sizeof(value_type));
+  }
 
-    struct iterator {
-      StringRef Data;
-      uint32_t Stride; // size of each element in the list.
-      const char *Current;
-
-      iterator(const ViewArray &A, const char *C)
-          : Data(A.Data), Stride(A.Stride), Current(C) {}
-      iterator(const iterator &) = default;
-
-      value_type operator*() {
-        // Explicitly zero the structure so that unused fields are zeroed. It is
-        // up to the user to know if the fields are used by verifying the PSV
-        // version.
-        value_type Val;
-        std::memset(&Val, 0, sizeof(value_type));
-        if (Current >= Data.end())
-          return Val;
-        memcpy(static_cast<void *>(&Val), Current,
-               std::min(Stride, MaxStride()));
-        if (sys::IsBigEndianHost)
-          detail::swapBytes(Val);
+  struct iterator {
+    StringRef Data;
+    uint32_t Stride; // size of each element in the list.
+    const char *Current;
+
+    iterator(const ViewArray &A, const char *C)
+        : Data(A.Data), Stride(A.Stride), Current(C) {}
+    iterator(const iterator &) = default;
+
+    value_type operator*() {
+      // Explicitly zero the structure so that unused fields are zeroed. It is
+      // up to the user to know if the fields are used by verifying the PSV
+      // version.
+      value_type Val;
+      std::memset(&Val, 0, sizeof(value_type));
+      if (Current >= Data.end())
         return Val;
-      }
+      memcpy(static_cast<void *>(&Val), Current, std::min(Stride, MaxStride()));
+      if (sys::IsBigEndianHost)
+        detail::swapBytes(Val);
+      return Val;
+    }
 
-      iterator operator++() {
-        if (Current < Data.end())
-          Current += Stride;
-        return *this;
-      }
+    iterator operator++() {
+      if (Current < Data.end())
+        Current += Stride;
+      return *this;
+    }
 
-      iterator operator++(int) {
-        iterator Tmp = *this;
-        ++*this;
-        return Tmp;
-      }
+    iterator operator++(int) {
+      iterator Tmp = *this;
+      ++*this;
+      return Tmp;
+    }
 
-      iterator operator--() {
-        if (Current > Data.begin())
-          Current -= Stride;
-        return *this;
-      }
+    iterator operator--() {
+      if (Current > Data.begin())
+        Current -= Stride;
+      return *this;
+    }
+
+    iterator operator--(int) {
+      iterator Tmp = *this;
+      --*this;
+      return Tmp;
+    }
 
-      iterator operator--(int) {
-        iterator Tmp = *this;
-        --*this;
-        return Tmp;
-      }
+    bool operator==(const iterator I) { return I.Current == Current; }
+    bool operator!=(const iterator I) { return !(*this == I); }
+  };
 
-      bool operator==(const iterator I) { return I.Current == Current; }
-      bool operator!=(const iterator I) { return !(*this == I); }
-    };
+  iterator begin() const { return iterator(*this, Data.begin()); }
 
-    iterator begin() const { return iterator(*this, Data.begin()); }
+  iterator end() const { return iterator(*this, Data.end()); }
 
-    iterator end() const { return iterator(*this, Data.end()); }
+  size_t size() const { return Data.size() / Stride; }
 
-    size_t size() const { return Data.size() / Stride; }
-  };
+  bool isEmpty() const { return Data.empty(); }
+};
+
+namespace DirectX {
+class PSVRuntimeInfo {
 
   using ResourceArray = ViewArray<dxbc::PSV::v2::ResourceBindInfo>;
   using SigElementArray = ViewArray<dxbc::PSV::v0::SignatureElement>;
@@ -232,6 +233,36 @@ class PSVRuntimeInfo {
   }
 };
 
+class Signature {
+  ViewArray<dxbc::ProgramSignatureElement> Parameters;
+  uint32_t StringTableOffset;
+  StringRef StringTable;
+
+public:
+  ViewArray<dxbc::ProgramSignatureElement>::iterator begin() const {
+    return Parameters.begin();
+  }
+
+  ViewArray<dxbc::ProgramSignatureElement>::iterator end() const {
+    return Parameters.end();
+  }
+
+  StringRef getName(uint32_t Offset) const {
+    assert(Offset >= StringTableOffset &&
+           Offset < StringTableOffset + StringTable.size() &&
+           "Offset out of range.");
+    // Name offsets are from the start of the signature data, not from the start
+    // of the string table. The header encodes the start offset of the sting
+    // table, so we convert the offset here.
+    uint32_t TableOffset = Offset - StringTableOffset;
+    return StringTable.slice(TableOffset, StringTable.find('\0', TableOffset));
+  }
+
+  bool isEmpty() const { return Parameters.isEmpty(); }
+
+  Error initialize(StringRef Part);
+};
+
 } // namespace DirectX
 
 class DXContainer {
@@ -248,6 +279,9 @@ class DXContainer {
   std::optional<uint64_t> ShaderFlags;
   std::optional<dxbc::ShaderHash> Hash;
   std::optional<DirectX::PSVRuntimeInfo> PSVInfo;
+  DirectX::Signature InputSignature;
+  DirectX::Signature OutputSignature;
+  DirectX::Signature PatchConstantSignature;
 
   Error parseHeader();
   Error parsePartOffsets();
@@ -255,6 +289,7 @@ class DXContainer {
   Error parseShaderFlags(StringRef Part);
   Error parseHash(StringRef Part);
   Error parsePSVInfo(StringRef Part);
+  Error parseSignature(StringRef Part, DirectX::Signature &Array);
   friend class PartIterator;
 
 public:
@@ -340,6 +375,14 @@ class DXContainer {
   const std::optional<DirectX::PSVRuntimeInfo> &getPSVInfo() const {
     return PSVInfo;
   };
+
+  const DirectX::Signature &getInputSignature() const { return InputSignature; }
+  const DirectX::Signature &getOutputSignature() const {
+    return OutputSignature;
+  }
+  const DirectX::Signature &getPatchConstantSignature() const {
+    return PatchConstantSignature;
+  }
 };
 
 } // namespace object
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index a896b1881ff34a..66a6ac70bbea10 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -129,6 +129,22 @@ struct PSVInfo {
   PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P);
 };
 
+struct SignatureParameter {
+  uint32_t Stream;
+  std::string Name;
+  uint32_t Index;
+  dxbc::D3DSystemValue SystemValue;
+  dxbc::SigComponentType CompType;
+  uint32_t Register;
+  uint8_t Mask;
+  uint8_t ExclusiveMask;
+  dxbc::SigMinPrecision MinPrecision;
+};
+
+struct Signature {
+  llvm::SmallVector<SignatureParameter> Parameters;
+};
+
 struct Part {
   Part() = default;
   Part(std::string N, uint32_t S) : Name(N), Size(S) {}
@@ -138,6 +154,7 @@ struct Part {
   std::optional<ShaderFlags> Flags;
   std::optional<ShaderHash> Hash;
   std::optional<PSVInfo> Info;
+  std::optional<DXContainerYAML::Signature> Signature;
 };
 
 struct Object {
@@ -152,9 +169,13 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::Part)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::ResourceBindInfo)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureElement)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::PSVInfo::MaskVector)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureParameter)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::SemanticKind)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode)
+LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue)
+LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType)
+LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision)
 
 namespace llvm {
 
@@ -202,6 +223,14 @@ template <> struct MappingTraits<DXContainerYAML::SignatureElement> {
   static void mapping(IO &IO, llvm::DXContainerYAML::SignatureElement &El);
 };
 
+template <> struct MappingTraits<DXContainerYAML::SignatureParameter> {
+  static void mapping(IO &IO, llvm::DXContainerYAML::SignatureParameter &El);
+};
+
+template <> struct MappingTraits<DXContainerYAML::Signature> {
+  static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El);
+};
+
 } // namespace yaml
 
 } // namespace llvm
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index b407fe277c543b..b2fdbc1936ef28 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -813,7 +813,7 @@ template <class FuncRecordTy, support::endianness Endian>
 Error getFuncNameViaRef(const FuncRecordTy *Record,
                         InstrProfSymtab &ProfileNames, StringRef &FuncName) {
   uint64_t NameRef = getFuncNameRef<FuncRecordTy, Endian>(Record);
-  FuncName = ProfileNames.getFuncName(NameRef);
+  FuncName = ProfileNames.getFuncOrVarName(NameRef);
   return Error::success();
 }
 
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index f9096b46157200..9239c1a691eca1 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -510,14 +510,14 @@ class InstrProfSymtab {
   /// an empty string.
   StringRef getFuncName(uint64_t FuncNameAddress, size_t NameSize);
 
-  /// Return function's PGO name from the name's md5 hash value.
-  /// If not found, return an empty string.
-  inline StringRef getFuncName(uint64_t FuncMD5Hash);
+  /// Return name of functions or global variables from the name's md5 hash
+  /// value. If not found, return an empty string.
+  inline StringRef getFuncOrVarName(uint64_t ValMD5Hash);
 
-  /// Just like getFuncName, except that it will return a non-empty StringRef
-  /// if the function is external to this symbol table. All such cases
-  /// will be represented using the same StringRef value.
-  inline StringRef getFuncNameOrExternalSymbol(uint64_t FuncMD5Hash);
+  /// Just like getFuncOrVarName, except that it will return literal string
+  /// 'External Symbol' if the function or global variable is external to
+  /// this symbol table.
+  inline StringRef getFuncOrVarNameIfDefined(uint64_t ValMD5Hash);
 
   /// True if Symbol is the value used to represent external symbols.
   static bool isExternalSymbol(const StringRef &Symbol) {
@@ -565,19 +565,19 @@ void InstrProfSymtab::finalizeSymtab() {
   Sorted = true;
 }
 
-StringRef InstrProfSymtab::getFuncNameOrExternalSymbol(uint64_t FuncMD5Hash) {
-  StringRef ret = getFuncName(FuncMD5Hash);
+StringRef InstrProfSymtab::getFuncOrVarNameIfDefined(uint64_t MD5Hash) {
+  StringRef ret = getFuncOrVarName(MD5Hash);
   if (ret.empty())
     return InstrProfSymtab::getExternalSymbol();
   return ret;
 }
 
-StringRef InstrProfSymtab::getFuncName(uint64_t FuncMD5Hash) {
+StringRef InstrProfSymtab::getFuncOrVarName(uint64_t MD5Hash) {
   finalizeSymtab();
-  auto Result = llvm::lower_bound(MD5NameMap, FuncMD5Hash,
+  auto Result = llvm::lower_bound(MD5NameMap, MD5Hash,
                                   [](const std::pair<uint64_t, StringRef> &LHS,
                                      uint64_t RHS) { return LHS.first < RHS; });
-  if (Result != MD5NameMap.end() && Result->first == FuncMD5Hash)
+  if (Result != MD5NameMap.end() && Result->first == MD5Hash)
     return Result->second;
   return StringRef();
 }
@@ -949,10 +949,6 @@ void InstrProfRecord::reserveSites(uint32_t ValueKind, uint32_t NumValueSites) {
   getOrCreateValueSitesForKind(ValueKind).reserve(NumValueSites);
 }
 
-inline support::endianness getHostEndianness() {
-  return sys::IsLittleEndianHost ? support::little : support::big;
-}
-
 // Include definitions for value profile data
 #define INSTR_PROF_VALUE_PROF_DATA
 #include "llvm/ProfileData/InstrProfData.inc"
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 4456bf1ab17632..8ba7e186d4fb1a 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -644,7 +644,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
        (uint64_t)'p' << 40 | (uint64_t)'r' << 32 | (uint64_t)'o' << 24 |  \
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
-/* FIXME: Please remedy the fixme in the header before bumping the version. */
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 8
 /* Indexed profile format version (start from 1). */
@@ -652,9 +651,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 5
 
-/* Profile version is always of type uint64_t. Reserve the upper 8 bits in the
- * version for other variants of profile. We set the lowest bit of the upper 8
- * bits (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation
+/* Profile version is always of type uint64_t. Reserve the upper 32 bits in the
+ * version for other variants of profile. We set the 8th most significant bit 
+ * (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation
  * generated profile, and 0 if this is a Clang FE generated profile.
  * 1 in bit 57 indicates there are context-sensitive records in the profile.
  * The 59th bit indicates whether to use debug info to correlate profiles.
@@ -663,7 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
  * The 62nd bit indicates whether memory profile information is present.
  * The 63rd bit indicates if this is a temporal profile.
  */
-#define VARIANT_MASKS_ALL 0xff00000000000000ULL
+#define VARIANT_MASKS_ALL 0xffffffff00000000ULL
 #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
 #define VARIANT_MASK_IR_PROF (0x1ULL << 56)
 #define VARIANT_MASK_CSIR_PROF (0x1ULL << 57)
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index faf91e017756da..acbc9e21fcc761 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -318,8 +318,8 @@ class RawInstrProfReader : public InstrProfReader {
   /// A list of timestamps paired with a function name reference.
   std::vector<std::pair<uint64_t, uint64_t>> TemporalProfTimestamps;
   bool ShouldSwapBytes;
-  // The value of the version field of the raw profile data header. The lower 56
-  // bits specifies the format version and the most significant 8 bits specify
+  // The value of the version field of the raw profile data header. The lower 32
+  // bits specifies the format version and the most significant 32 bits specify
   // the variant types of the profile.
   uint64_t Version;
   uint64_t CountersDelta;
@@ -413,10 +413,9 @@ class RawInstrProfReader : public InstrProfReader {
   }
 
   support::endianness getDataEndianness() const {
-    support::endianness HostEndian = getHostEndianness();
     if (!ShouldSwapBytes)
-      return HostEndian;
-    if (HostEndian == support::little)
+      return llvm::endianness::native;
+    if (llvm::endianness::native == llvm::endianness::little)
       return support::big;
     else
       return support::little;
@@ -452,7 +451,7 @@ class RawInstrProfReader : public InstrProfReader {
   }
 
   StringRef getName(uint64_t NameRef) const {
-    return Symtab->getFuncName(swap(NameRef));
+    return Symtab->getFuncOrVarName(swap(NameRef));
   }
 
   int getCounterTypeSize() const {
diff --git a/llvm/include/llvm/Support/BlockFrequency.h b/llvm/include/llvm/Support/BlockFrequency.h
index 12a753301b36ab..8b172ee486aab8 100644
--- a/llvm/include/llvm/Support/BlockFrequency.h
+++ b/llvm/include/llvm/Support/BlockFrequency.h
@@ -26,10 +26,11 @@ class BlockFrequency {
   uint64_t Frequency;
 
 public:
-  BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { }
+  BlockFrequency() : Frequency(0) {}
+  explicit BlockFrequency(uint64_t Freq) : Frequency(Freq) {}
 
   /// Returns the maximum possible frequency, the saturation value.
-  static uint64_t getMaxFrequency() { return UINT64_MAX; }
+  static BlockFrequency max() { return BlockFrequency(UINT64_MAX); }
 
   /// Returns the frequency as a fixpoint number scaled by the entry
   /// frequency.
@@ -112,6 +113,10 @@ class BlockFrequency {
   bool operator==(BlockFrequency RHS) const {
     return Frequency == RHS.Frequency;
   }
+
+  bool operator!=(BlockFrequency RHS) const {
+    return Frequency != RHS.Frequency;
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h
index d407ed11276711..808446e615458f 100644
--- a/llvm/include/llvm/Support/Endian.h
+++ b/llvm/include/llvm/Support/Endian.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_SUPPORT_ENDIAN_H
 #define LLVM_SUPPORT_ENDIAN_H
 
+#include "llvm/ADT/bit.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include <cassert>
@@ -24,7 +25,12 @@
 namespace llvm {
 namespace support {
 
-enum endianness {big, little, native};
+// TODO: Remove the following once we are done migrating to llvm::endianness,
+// llvm::endianness::big, etc.
+using endianness = llvm::endianness;
+constexpr llvm::endianness big = llvm::endianness::big;
+constexpr llvm::endianness little = llvm::endianness::little;
+constexpr llvm::endianness native = llvm::endianness::native;
 
 // These are named values for common alignments.
 enum {aligned = 0, unaligned = 1};
@@ -41,13 +47,13 @@ struct PickAlignment {
 
 namespace endian {
 
-constexpr endianness system_endianness() {
-  return sys::IsBigEndianHost ? big : little;
-}
+LLVM_DEPRECATED("Use llvm::endianness::native instead",
+                "llvm::endianness::native")
+constexpr endianness system_endianness() { return llvm::endianness::native; }
 
 template <typename value_type>
 [[nodiscard]] inline value_type byte_swap(value_type value, endianness endian) {
-  if ((endian != native) && (endian != system_endianness()))
+  if (endian != native)
     sys::swapByteOrder(value);
   return value;
 }
diff --git a/llvm/include/llvm/Support/HashBuilder.h b/llvm/include/llvm/Support/HashBuilder.h
index 04a7b2e7dc8ab2..ccbfe614eb86d2 100644
--- a/llvm/include/llvm/Support/HashBuilder.h
+++ b/llvm/include/llvm/Support/HashBuilder.h
@@ -85,27 +85,67 @@ template <typename HasherT> class HashBuilderBase {
   HasherT &Hasher;
 };
 
-/// Implementation of the `HashBuilder` interface.
+/// Interface to help hash various types through a hasher type.
+///
+/// Via provided specializations of `add`, `addRange`, and `addRangeElements`
+/// functions, various types (e.g. `ArrayRef`, `StringRef`, etc.) can be hashed
+/// without requiring any knowledge of hashed types from the hasher type.
+///
+/// The only method expected from the templated hasher type `HasherT` is:
+/// * void update(ArrayRef<uint8_t> Data)
+///
+/// Additionally, the following methods will be forwarded to the hasher type:
+/// * decltype(std::declval<HasherT &>().final()) final()
+/// * decltype(std::declval<HasherT &>().result()) result()
+///
+/// From a user point of view, the interface provides the following:
+/// * `template<typename T> add(const T &Value)`
+///   The `add` function implements hashing of various types.
+/// * `template <typename ItT> void addRange(ItT First, ItT Last)`
+///   The `addRange` function is designed to aid hashing a range of values.
+///   It explicitly adds the size of the range in the hash.
+/// * `template <typename ItT> void addRangeElements(ItT First, ItT Last)`
+///   The `addRangeElements` function is also designed to aid hashing a range of
+///   values. In contrast to `addRange`, it **ignores** the size of the range,
+///   behaving as if elements were added one at a time with `add`.
 ///
-/// `support::endianness::native` is not supported. `HashBuilder` is
-/// expected to canonicalize `support::endianness::native` to one of
-/// `support::endianness::big` or `support::endianness::little`.
+/// User-defined `struct` types can participate in this interface by providing
+/// an `addHash` templated function. See the associated template specialization
+/// for details.
+///
+/// This interface does not impose requirements on the hasher
+/// `update(ArrayRef<uint8_t> Data)` method. We want to avoid collisions for
+/// variable-size types; for example for
+/// ```
+/// builder.add({1});
+/// builder.add({2, 3});
+/// ```
+/// and
+/// ```
+/// builder.add({1, 2});
+/// builder.add({3});
+/// ```
+/// . Thus, specializations of `add` and `addHash` for variable-size types must
+/// not assume that the hasher type considers the size as part of the hash; they
+/// must explicitly add the size to the hash. See for example specializations
+/// for `ArrayRef` and `StringRef`.
+///
+/// Additionally, since types are eventually forwarded to the hasher's
+/// `void update(ArrayRef<uint8_t>)` method, endianness plays a role in the hash
+/// computation (for example when computing `add((int)123)`).
+/// Specifiying a non-`native` `Endianness` template parameter allows to compute
+/// stable hash across platforms with different endianness.
 template <typename HasherT, support::endianness Endianness>
-class HashBuilderImpl : public HashBuilderBase<HasherT> {
-  static_assert(Endianness != support::endianness::native,
-                "HashBuilder should canonicalize endianness");
-
+class HashBuilder : public HashBuilderBase<HasherT> {
 public:
-  explicit HashBuilderImpl(HasherT &Hasher)
-      : HashBuilderBase<HasherT>(Hasher) {}
+  explicit HashBuilder(HasherT &Hasher) : HashBuilderBase<HasherT>(Hasher) {}
   template <typename... ArgTypes>
-  explicit HashBuilderImpl(ArgTypes &&...Args)
+  explicit HashBuilder(ArgTypes &&...Args)
       : HashBuilderBase<HasherT>(Args...) {}
 
   /// Implement hashing for hashable data types, e.g. integral or enum values.
   template <typename T>
-  std::enable_if_t<hashbuilder_detail::IsHashableData<T>::value,
-                   HashBuilderImpl &>
+  std::enable_if_t<hashbuilder_detail::IsHashableData<T>::value, HashBuilder &>
   add(T Value) {
     return adjustForEndiannessAndAdd(Value);
   }
@@ -123,14 +163,14 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   /// builder.add({3});
   /// ```
   /// do not collide.
-  template <typename T> HashBuilderImpl &add(ArrayRef<T> Value) {
+  template <typename T> HashBuilder &add(ArrayRef<T> Value) {
     // As of implementation time, simply calling `addRange(Value)` would also go
     // through the `update` fast path. But that would rely on the implementation
     // details of `ArrayRef::begin()` and `ArrayRef::end()`. Explicitly call
     // `update` to guarantee the fast path.
     add(Value.size());
     if (hashbuilder_detail::IsHashableData<T>::value &&
-        Endianness == support::endian::system_endianness()) {
+        Endianness == llvm::endianness::native) {
       this->update(ArrayRef(reinterpret_cast<const uint8_t *>(Value.begin()),
                             Value.size() * sizeof(T)));
     } else {
@@ -153,7 +193,7 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   /// builder.add("c");
   /// ```
   /// do not collide.
-  HashBuilderImpl &add(StringRef Value) {
+  HashBuilder &add(StringRef Value) {
     // As of implementation time, simply calling `addRange(Value)` would also go
     // through `update`. But that would rely on the implementation of
     // `StringRef::begin()` and `StringRef::end()`. Explicitly call `update` to
@@ -166,7 +206,7 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
 
   template <typename T>
   using HasAddHashT =
-      decltype(addHash(std::declval<HashBuilderImpl &>(), std::declval<T &>()));
+      decltype(addHash(std::declval<HashBuilder &>(), std::declval<T &>()));
   /// Implement hashing for user-defined `struct`s.
   ///
   /// Any user-define `struct` can participate in hashing via `HashBuilder` by
@@ -186,7 +226,7 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   /// };
   ///
   /// template <typename HasherT, support::endianness Endianness>
-  /// void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  /// void addHash(HashBuilder<HasherT, Endianness> &HBuilder,
   ///              const SimpleStruct &Value) {
   ///   HBuilder.add(Value.c);
   ///   HBuilder.add(Value.i);
@@ -206,9 +246,9 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   ///   // If possible, we want to hash both `I` and `C` in a single
   ///   // `update` call for performance concerns.
   ///   template <typename HasherT, support::endianness Endianness>
-  ///   friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  ///   friend void addHash(HashBuilder<HasherT, Endianness> &HBuilder,
   ///                       const StructWithFastHash &Value) {
-  ///     if (Endianness == support::endian::system_endianness()) {
+  ///     if (Endianness == llvm::endianness::native) {
   ///       HBuilder.update(ArrayRef(
   ///           reinterpret_cast<const uint8_t *>(&Value), sizeof(Value)));
   ///     } else {
@@ -236,9 +276,9 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   ///       Elements[I] = I;
   ///   }
   ///   template <typename HasherT, support::endianness Endianness>
-  ///   friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  ///   friend void addHash(HashBuilder<HasherT, Endianness> &HBuilder,
   ///                       const CustomContainer &Value) {
-  ///     if (Endianness == support::endian::system_endianness()) {
+  ///     if (Endianness == llvm::endianness::native) {
   ///       HBuilder.update(ArrayRef(
   ///           reinterpret_cast<const uint8_t *>(&Value.Size),
   ///           sizeof(Value.Size) + Value.Size * sizeof(Value.Elements[0])));
@@ -253,18 +293,18 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   template <typename T>
   std::enable_if_t<is_detected<HasAddHashT, T>::value &&
                        !hashbuilder_detail::IsHashableData<T>::value,
-                   HashBuilderImpl &>
+                   HashBuilder &>
   add(const T &Value) {
     addHash(*this, Value);
     return *this;
   }
 
   template <typename T1, typename T2>
-  HashBuilderImpl &add(const std::pair<T1, T2> &Value) {
+  HashBuilder &add(const std::pair<T1, T2> &Value) {
     return add(Value.first, Value.second);
   }
 
-  template <typename... Ts> HashBuilderImpl &add(const std::tuple<Ts...> &Arg) {
+  template <typename... Ts> HashBuilder &add(const std::tuple<Ts...> &Arg) {
     std::apply([this](const auto &...Args) { this->add(Args...); }, Arg);
     return *this;
   }
@@ -280,31 +320,29 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   /// add(Arg2)
   /// ```
   template <typename... Ts>
-  std::enable_if_t<(sizeof...(Ts) > 1), HashBuilderImpl &>
-  add(const Ts &...Args) {
+  std::enable_if_t<(sizeof...(Ts) > 1), HashBuilder &> add(const Ts &...Args) {
     return (add(Args), ...);
   }
 
   template <typename ForwardIteratorT>
-  HashBuilderImpl &addRange(ForwardIteratorT First, ForwardIteratorT Last) {
+  HashBuilder &addRange(ForwardIteratorT First, ForwardIteratorT Last) {
     add(std::distance(First, Last));
     return addRangeElements(First, Last);
   }
 
-  template <typename RangeT> HashBuilderImpl &addRange(const RangeT &Range) {
+  template <typename RangeT> HashBuilder &addRange(const RangeT &Range) {
     return addRange(adl_begin(Range), adl_end(Range));
   }
 
   template <typename ForwardIteratorT>
-  HashBuilderImpl &addRangeElements(ForwardIteratorT First,
-                                    ForwardIteratorT Last) {
+  HashBuilder &addRangeElements(ForwardIteratorT First, ForwardIteratorT Last) {
     return addRangeElementsImpl(
         First, Last,
         typename std::iterator_traits<ForwardIteratorT>::iterator_category());
   }
 
   template <typename RangeT>
-  HashBuilderImpl &addRangeElements(const RangeT &Range) {
+  HashBuilder &addRangeElements(const RangeT &Range) {
     return addRangeElements(adl_begin(Range), adl_end(Range));
   }
 
@@ -313,7 +351,7 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
       std::declval<T &>(), support::endianness::little));
   /// Adjust `Value` for the target endianness and add it to the hash.
   template <typename T>
-  std::enable_if_t<is_detected<HasByteSwapT, T>::value, HashBuilderImpl &>
+  std::enable_if_t<is_detected<HasByteSwapT, T>::value, HashBuilder &>
   adjustForEndiannessAndAdd(const T &Value) {
     T SwappedValue = support::endian::byte_swap(Value, Endianness);
     this->update(ArrayRef(reinterpret_cast<const uint8_t *>(&SwappedValue),
@@ -325,9 +363,9 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   // FIXME: Once available, specialize this function for `contiguous_iterator`s,
   // and use it for `ArrayRef` and `StringRef`.
   template <typename ForwardIteratorT>
-  HashBuilderImpl &addRangeElementsImpl(ForwardIteratorT First,
-                                        ForwardIteratorT Last,
-                                        std::forward_iterator_tag) {
+  HashBuilder &addRangeElementsImpl(ForwardIteratorT First,
+                                    ForwardIteratorT Last,
+                                    std::forward_iterator_tag) {
     for (auto It = First; It != Last; ++It)
       add(*It);
     return *this;
@@ -335,8 +373,8 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
 
   template <typename T>
   std::enable_if_t<hashbuilder_detail::IsHashableData<T>::value &&
-                       Endianness == support::endian::system_endianness(),
-                   HashBuilderImpl &>
+                       Endianness == llvm::endianness::native,
+                   HashBuilder &>
   addRangeElementsImpl(T *First, T *Last, std::forward_iterator_tag) {
     this->update(ArrayRef(reinterpret_cast<const uint8_t *>(First),
                           (Last - First) * sizeof(T)));
@@ -344,62 +382,6 @@ class HashBuilderImpl : public HashBuilderBase<HasherT> {
   }
 };
 
-/// Interface to help hash various types through a hasher type.
-///
-/// Via provided specializations of `add`, `addRange`, and `addRangeElements`
-/// functions, various types (e.g. `ArrayRef`, `StringRef`, etc.) can be hashed
-/// without requiring any knowledge of hashed types from the hasher type.
-///
-/// The only method expected from the templated hasher type `HasherT` is:
-/// * void update(ArrayRef<uint8_t> Data)
-///
-/// Additionally, the following methods will be forwarded to the hasher type:
-/// * decltype(std::declval<HasherT &>().final()) final()
-/// * decltype(std::declval<HasherT &>().result()) result()
-///
-/// From a user point of view, the interface provides the following:
-/// * `template<typename T> add(const T &Value)`
-///   The `add` function implements hashing of various types.
-/// * `template <typename ItT> void addRange(ItT First, ItT Last)`
-///   The `addRange` function is designed to aid hashing a range of values.
-///   It explicitly adds the size of the range in the hash.
-/// * `template <typename ItT> void addRangeElements(ItT First, ItT Last)`
-///   The `addRangeElements` function is also designed to aid hashing a range of
-///   values. In contrast to `addRange`, it **ignores** the size of the range,
-///   behaving as if elements were added one at a time with `add`.
-///
-/// User-defined `struct` types can participate in this interface by providing
-/// an `addHash` templated function. See the associated template specialization
-/// for details.
-///
-/// This interface does not impose requirements on the hasher
-/// `update(ArrayRef<uint8_t> Data)` method. We want to avoid collisions for
-/// variable-size types; for example for
-/// ```
-/// builder.add({1});
-/// builder.add({2, 3});
-/// ```
-/// and
-/// ```
-/// builder.add({1, 2});
-/// builder.add({3});
-/// ```
-/// . Thus, specializations of `add` and `addHash` for variable-size types must
-/// not assume that the hasher type considers the size as part of the hash; they
-/// must explicitly add the size to the hash. See for example specializations
-/// for `ArrayRef` and `StringRef`.
-///
-/// Additionally, since types are eventually forwarded to the hasher's
-/// `void update(ArrayRef<uint8_t>)` method, endianness plays a role in the hash
-/// computation (for example when computing `add((int)123)`).
-/// Specifiying a non-`native` `Endianness` template parameter allows to compute
-/// stable hash across platforms with different endianness.
-template <class HasherT, support::endianness Endianness>
-using HashBuilder =
-    HashBuilderImpl<HasherT, (Endianness == support::endianness::native
-                                  ? support::endian::system_endianness()
-                                  : Endianness)>;
-
 namespace hashbuilder_detail {
 class HashCodeHasher {
 public:
diff --git a/llvm/include/llvm/Support/SwapByteOrder.h b/llvm/include/llvm/Support/SwapByteOrder.h
index 43fa5347d6fe85..8f26af6f68ac65 100644
--- a/llvm/include/llvm/Support/SwapByteOrder.h
+++ b/llvm/include/llvm/Support/SwapByteOrder.h
@@ -19,40 +19,12 @@
 #include <cstdint>
 #include <type_traits>
 
-#if defined(__linux__) || defined(__GNU__) || defined(__HAIKU__) ||            \
-    defined(__Fuchsia__) || defined(__EMSCRIPTEN__)
-#include <endian.h>
-#elif defined(_AIX)
-#include <sys/machine.h>
-#elif defined(__sun)
-/* Solaris provides _BIG_ENDIAN/_LITTLE_ENDIAN selector in sys/types.h */
-#include <sys/types.h>
-#define BIG_ENDIAN 4321
-#define LITTLE_ENDIAN 1234
-#if defined(_BIG_ENDIAN)
-#define BYTE_ORDER BIG_ENDIAN
-#else
-#define BYTE_ORDER LITTLE_ENDIAN
-#endif
-#elif defined(__MVS__)
-#define BIG_ENDIAN 4321
-#define LITTLE_ENDIAN 1234
-#define BYTE_ORDER BIG_ENDIAN
-#else
-#if !defined(BYTE_ORDER) && !defined(_WIN32)
-#include <machine/endian.h>
-#endif
-#endif
-
 namespace llvm {
 
 namespace sys {
 
-#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
-constexpr bool IsBigEndianHost = true;
-#else
-constexpr bool IsBigEndianHost = false;
-#endif
+constexpr bool IsBigEndianHost =
+    llvm::endianness::native == llvm::endianness::big;
 
 static const bool IsLittleEndianHost = !IsBigEndianHost;
 
diff --git a/llvm/include/llvm/Support/VersionTuple.h b/llvm/include/llvm/Support/VersionTuple.h
index 828a6db54708df..b85500e20d1bdd 100644
--- a/llvm/include/llvm/Support/VersionTuple.h
+++ b/llvm/include/llvm/Support/VersionTuple.h
@@ -22,8 +22,7 @@
 #include <tuple>
 
 namespace llvm {
-template <typename HasherT, support::endianness Endianness>
-class HashBuilderImpl;
+template <typename HasherT, support::endianness Endianness> class HashBuilder;
 class raw_ostream;
 class StringRef;
 
@@ -175,7 +174,7 @@ class VersionTuple {
   }
 
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(HashBuilder<HasherT, Endianness> &HBuilder,
                       const VersionTuple &VT) {
     HBuilder.add(VT.Major, VT.Minor, VT.Subminor, VT.Build);
   }
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index b9845473bf8b33..616f2d79028615 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -89,11 +89,14 @@ enum CPUFeatures {
   FEAT_SME_F64,
   FEAT_SME_I64,
   FEAT_SME2,
-  FEAT_MAX
+  FEAT_RCPC3,
+  FEAT_MAX,
+  FEAT_EXT = 62,
+  FEAT_INIT
 };
 
-static_assert(FEAT_MAX <= 64,
-              "CPUFeatures enum must not have more than 64 entries");
+static_assert(FEAT_MAX < 62,
+              "Number of features in CPUFeatures are limited to 62 entries");
 
 // Arch extension modifiers for CPUs. These are labelled with their Arm ARM
 // feature name (though the canonical reference for those is AArch64.td)
@@ -185,14 +188,14 @@ struct ExtensionInfo {
 // clang-format off
 inline constexpr ExtensionInfo Extensions[] = {
     {"aes", AArch64::AEK_AES, "+aes", "-aes", FEAT_AES, "+fp-armv8,+neon", 150},
-    {"b16b16", AArch64::AEK_B16B16, "+b16b16", "-b16b16", FEAT_MAX, "", 0},
+    {"b16b16", AArch64::AEK_B16B16, "+b16b16", "-b16b16", FEAT_INIT, "", 0},
     {"bf16", AArch64::AEK_BF16, "+bf16", "-bf16", FEAT_BF16, "+bf16", 280},
-    {"brbe", AArch64::AEK_BRBE, "+brbe", "-brbe", FEAT_MAX, "", 0},
+    {"brbe", AArch64::AEK_BRBE, "+brbe", "-brbe", FEAT_INIT, "", 0},
     {"bti", AArch64::AEK_NONE, {}, {}, FEAT_BTI, "+bti", 510},
     {"crc", AArch64::AEK_CRC, "+crc", "-crc", FEAT_CRC, "+crc", 110},
-    {"crypto", AArch64::AEK_CRYPTO, "+crypto", "-crypto", FEAT_MAX, "+aes,+sha2", 0},
-    {"cssc", AArch64::AEK_CSSC, "+cssc", "-cssc", FEAT_MAX, "", 0},
-    {"d128", AArch64::AEK_D128, "+d128", "-d128", FEAT_MAX, "", 0},
+    {"crypto", AArch64::AEK_CRYPTO, "+crypto", "-crypto", FEAT_INIT, "+aes,+sha2", 0},
+    {"cssc", AArch64::AEK_CSSC, "+cssc", "-cssc", FEAT_INIT, "", 0},
+    {"d128", AArch64::AEK_D128, "+d128", "-d128", FEAT_INIT, "", 0},
     {"dgh", AArch64::AEK_NONE, {}, {}, FEAT_DGH, "", 260},
     {"dit", AArch64::AEK_NONE, {}, {}, FEAT_DIT, "+dit", 180},
     {"dotprod", AArch64::AEK_DOTPROD, "+dotprod", "-dotprod", FEAT_DOTPROD, "+dotprod,+fp-armv8,+neon", 50},
@@ -208,30 +211,30 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16", FEAT_FP16, "+fullfp16,+fp-armv8,+neon", 170},
     {"fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml", FEAT_FP16FML, "+fp16fml,+fullfp16,+fp-armv8,+neon", 40},
     {"frintts", AArch64::AEK_NONE, {}, {}, FEAT_FRINTTS, "+fptoint", 250},
-    {"hbc", AArch64::AEK_HBC, "+hbc", "-hbc", FEAT_MAX, "", 0},
+    {"hbc", AArch64::AEK_HBC, "+hbc", "-hbc", FEAT_INIT, "", 0},
     {"i8mm", AArch64::AEK_I8MM, "+i8mm", "-i8mm", FEAT_I8MM, "+i8mm", 270},
-    {"ite", AArch64::AEK_ITE, "+ite", "-ite", FEAT_MAX, "", 0},
+    {"ite", AArch64::AEK_ITE, "+ite", "-ite", FEAT_INIT, "", 0},
     {"jscvt", AArch64::AEK_NONE, {}, {}, FEAT_JSCVT, "+fp-armv8,+neon,+jsconv", 210},
     {"ls64_accdata", AArch64::AEK_NONE, {}, {}, FEAT_LS64_ACCDATA, "+ls64", 540},
     {"ls64_v", AArch64::AEK_NONE, {}, {}, FEAT_LS64_V, "", 530},
     {"ls64", AArch64::AEK_LS64, "+ls64", "-ls64", FEAT_LS64, "", 520},
     {"lse", AArch64::AEK_LSE, "+lse", "-lse", FEAT_LSE, "+lse", 80},
-    {"lse128", AArch64::AEK_LSE128, "+lse128", "-lse128", FEAT_MAX, "", 0},
+    {"lse128", AArch64::AEK_LSE128, "+lse128", "-lse128", FEAT_INIT, "", 0},
     {"memtag", AArch64::AEK_MTE, "+mte", "-mte", FEAT_MEMTAG, "", 440},
     {"memtag2", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG2, "+mte", 450},
     {"memtag3", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG3, "+mte", 460},
-    {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_MAX, "", 0},
-    {"pauth", AArch64::AEK_PAUTH, "+pauth", "-pauth", FEAT_MAX, "", 0},
+    {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_INIT, "", 0},
+    {"pauth", AArch64::AEK_PAUTH, "+pauth", "-pauth", FEAT_INIT, "", 0},
     {"pmull", AArch64::AEK_NONE, {}, {}, FEAT_PMULL, "+aes,+fp-armv8,+neon", 160},
-    {"pmuv3", AArch64::AEK_PERFMON, "+perfmon", "-perfmon", FEAT_MAX, "", 0},
+    {"pmuv3", AArch64::AEK_PERFMON, "+perfmon", "-perfmon", FEAT_INIT, "", 0},
     {"predres", AArch64::AEK_PREDRES, "+predres", "-predres", FEAT_PREDRES, "+predres", 480},
-    {"predres2", AArch64::AEK_SPECRES2, "+specres2", "-specres2", FEAT_MAX, "", 0},
-    {"profile", AArch64::AEK_PROFILE, "+spe", "-spe", FEAT_MAX, "", 0},
-    {"ras", AArch64::AEK_RAS, "+ras", "-ras", FEAT_MAX, "", 0},
-    {"rasv2", AArch64::AEK_RASv2, "+rasv2", "-rasv2", FEAT_MAX, "", 0},
+    {"predres2", AArch64::AEK_SPECRES2, "+specres2", "-specres2", FEAT_INIT, "", 0},
+    {"profile", AArch64::AEK_PROFILE, "+spe", "-spe", FEAT_INIT, "", 0},
+    {"ras", AArch64::AEK_RAS, "+ras", "-ras", FEAT_INIT, "", 0},
+    {"rasv2", AArch64::AEK_RASv2, "+rasv2", "-rasv2", FEAT_INIT, "", 0},
     {"rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc", FEAT_RCPC, "+rcpc", 230},
     {"rcpc2", AArch64::AEK_NONE, {}, {}, FEAT_RCPC2, "+rcpc", 240},
-    {"rcpc3", AArch64::AEK_RCPC3, "+rcpc3", "-rcpc3", FEAT_MAX, "", 0},
+    {"rcpc3", AArch64::AEK_RCPC3, "+rcpc3", "-rcpc3", FEAT_RCPC3, "+rcpc,+rcpc3", 241},
     {"rdm", AArch64::AEK_RDM, "+rdm", "-rdm", FEAT_RDM, "+rdm,+fp-armv8,+neon", 70},
     {"rng", AArch64::AEK_RAND, "+rand", "-rand", FEAT_RNG, "+rand", 10},
     {"rpres", AArch64::AEK_NONE, {}, {}, FEAT_RPRES, "", 300},
@@ -241,12 +244,12 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"sha3", AArch64::AEK_SHA3, "+sha3", "-sha3", FEAT_SHA3, "+sha3,+sha2,+fp-armv8,+neon", 140},
     {"simd", AArch64::AEK_SIMD, "+neon", "-neon", FEAT_SIMD, "+fp-armv8,+neon", 100},
     {"sm4", AArch64::AEK_SM4, "+sm4", "-sm4", FEAT_SM4, "+sm4,+fp-armv8,+neon", 60},
-    {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_MAX, "", 0},
+    {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_INIT, "", 0},
     {"sme-f64f64", AArch64::AEK_SMEF64F64, "+sme-f64f64", "-sme-f64f64", FEAT_SME_F64, "+sme,+sme-f64f64,+bf16", 560},
     {"sme-i16i64", AArch64::AEK_SMEI16I64, "+sme-i16i64", "-sme-i16i64", FEAT_SME_I64, "+sme,+sme-i16i64,+bf16", 570},
     {"sme", AArch64::AEK_SME, "+sme", "-sme", FEAT_SME, "+sme,+bf16", 430},
     {"sme2", AArch64::AEK_SME2, "+sme2", "-sme2", FEAT_SME2, "+sme2,+sme,+bf16", 580},
-    {"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_MAX, "", 0},
+    {"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_INIT, "", 0},
     {"ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs", FEAT_SSBS, "", 490},
     {"ssbs2", AArch64::AEK_NONE, {}, {}, FEAT_SSBS2, "+ssbs", 500},
     {"sve-bf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_BF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320},
@@ -259,13 +262,13 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3", FEAT_SVE_SHA3, "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410},
     {"sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4", FEAT_SVE_SM4, "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420},
     {"sve2", AArch64::AEK_SVE2, "+sve2", "-sve2", FEAT_SVE2, "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370},
-    {"sve2p1", AArch64::AEK_SVE2p1, "+sve2p1", "-sve2p1", FEAT_MAX, "+sve2p1,+sve2,+sve,+fullfp16,+fp-armv8,+neon", 0},
-    {"the", AArch64::AEK_THE, "+the", "-the", FEAT_MAX, "", 0},
-    {"tme", AArch64::AEK_TME, "+tme", "-tme", FEAT_MAX, "", 0},
+    {"sve2p1", AArch64::AEK_SVE2p1, "+sve2p1", "-sve2p1", FEAT_INIT, "+sve2p1,+sve2,+sve,+fullfp16,+fp-armv8,+neon", 0},
+    {"the", AArch64::AEK_THE, "+the", "-the", FEAT_INIT, "", 0},
+    {"tme", AArch64::AEK_TME, "+tme", "-tme", FEAT_INIT, "", 0},
     {"wfxt", AArch64::AEK_NONE, {}, {}, FEAT_WFXT, "+wfxt", 550},
-    {"gcs", AArch64::AEK_GCS, "+gcs", "-gcs", FEAT_MAX, "", 0},
+    {"gcs", AArch64::AEK_GCS, "+gcs", "-gcs", FEAT_INIT, "", 0},
     // Special cases
-    {"none", AArch64::AEK_NONE, {}, {}, FEAT_MAX, "", ExtensionInfo::MaxFMVPriority},
+    {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority},
 };
 // clang-format on
 
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
index 4d31898bb3147b..269441db7a5589 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
@@ -101,7 +101,8 @@ template <class Edge, class BBInfo> class CFGMST {
     LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
 
     BasicBlock *Entry = &(F.getEntryBlock());
-    uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+    uint64_t EntryWeight =
+        (BFI != nullptr ? BFI->getEntryFreq().getFrequency() : 2);
     // If we want to instrument the entry count, lower the weight to 0.
     if (InstrumentFuncEntry)
       EntryWeight = 0;
diff --git a/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
index b18d04cc73dbca..96c9bfa0e372ce 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
@@ -78,14 +78,13 @@ cl::opt<PGOViewCountsType> PGOViewCounts(
                clEnumValN(PGOVCT_Graph, "graph", "show a graph."),
                clEnumValN(PGOVCT_Text, "text", "show in text.")));
 
-static cl::opt<bool> PrintBlockFreq(
-    "print-bfi", cl::init(false), cl::Hidden,
-    cl::desc("Print the block frequency info."));
-
-cl::opt<std::string> PrintBlockFreqFuncName(
-    "print-bfi-func-name", cl::Hidden,
-    cl::desc("The option to specify the name of the function "
-             "whose block frequency info is printed."));
+static cl::opt<bool> PrintBFI("print-bfi", cl::init(false), cl::Hidden,
+                              cl::desc("Print the block frequency info."));
+
+cl::opt<std::string>
+    PrintBFIFuncName("print-bfi-func-name", cl::Hidden,
+                     cl::desc("The option to specify the name of the function "
+                              "whose block frequency info is printed."));
 } // namespace llvm
 
 namespace llvm {
@@ -193,15 +192,14 @@ void BlockFrequencyInfo::calculate(const Function &F,
        F.getName().equals(ViewBlockFreqFuncName))) {
     view();
   }
-  if (PrintBlockFreq &&
-      (PrintBlockFreqFuncName.empty() ||
-       F.getName().equals(PrintBlockFreqFuncName))) {
+  if (PrintBFI &&
+      (PrintBFIFuncName.empty() || F.getName().equals(PrintBFIFuncName))) {
     print(dbgs());
   }
 }
 
 BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
-  return BFI ? BFI->getBlockFreq(BB) : 0;
+  return BFI ? BFI->getBlockFreq(BB) : BlockFrequency(0);
 }
 
 std::optional<uint64_t>
@@ -214,7 +212,7 @@ BlockFrequencyInfo::getBlockProfileCount(const BasicBlock *BB,
 }
 
 std::optional<uint64_t>
-BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
+BlockFrequencyInfo::getProfileCountFromFreq(BlockFrequency Freq) const {
   if (!BFI)
     return std::nullopt;
   return BFI->getProfileCountFromFreq(*getFunction(), Freq);
@@ -225,17 +223,18 @@ bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) {
   return BFI->isIrrLoopHeader(BB);
 }
 
-void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
+void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB,
+                                      BlockFrequency Freq) {
   assert(BFI && "Expected analysis to be available");
   BFI->setBlockFreq(BB, Freq);
 }
 
 void BlockFrequencyInfo::setBlockFreqAndScale(
-    const BasicBlock *ReferenceBB, uint64_t Freq,
+    const BasicBlock *ReferenceBB, BlockFrequency Freq,
     SmallPtrSetImpl<BasicBlock *> &BlocksToScale) {
   assert(BFI && "Expected analysis to be available");
   // Use 128 bits APInt to avoid overflow.
-  APInt NewFreq(128, Freq);
+  APInt NewFreq(128, Freq.getFrequency());
   APInt OldFreq(128, BFI->getBlockFreq(ReferenceBB).getFrequency());
   APInt BBFreq(128, 0);
   for (auto *BB : BlocksToScale) {
@@ -247,7 +246,7 @@ void BlockFrequencyInfo::setBlockFreqAndScale(
     // a hot spot, one of the options proposed in
     // https://reviews.llvm.org/D28535#650071 could be used to avoid this.
     BBFreq = BBFreq.udiv(OldFreq);
-    BFI->setBlockFreq(BB, BBFreq.getLimitedValue());
+    BFI->setBlockFreq(BB, BlockFrequency(BBFreq.getLimitedValue()));
   }
   BFI->setBlockFreq(ReferenceBB, Freq);
 }
@@ -266,19 +265,8 @@ const BranchProbabilityInfo *BlockFrequencyInfo::getBPI() const {
   return BFI ? &BFI->getBPI() : nullptr;
 }
 
-raw_ostream &BlockFrequencyInfo::
-printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const {
-  return BFI ? BFI->printBlockFreq(OS, Freq) : OS;
-}
-
-raw_ostream &
-BlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
-                                   const BasicBlock *BB) const {
-  return BFI ? BFI->printBlockFreq(OS, BB) : OS;
-}
-
-uint64_t BlockFrequencyInfo::getEntryFreq() const {
-  return BFI ? BFI->getEntryFreq() : 0;
+BlockFrequency BlockFrequencyInfo::getEntryFreq() const {
+  return BFI ? BFI->getEntryFreq() : BlockFrequency(0);
 }
 
 void BlockFrequencyInfo::releaseMemory() { BFI.reset(); }
@@ -293,6 +281,18 @@ void BlockFrequencyInfo::verifyMatch(BlockFrequencyInfo &Other) const {
     BFI->verifyMatch(*Other.BFI);
 }
 
+Printable llvm::printBlockFreq(const BlockFrequencyInfo &BFI,
+                               BlockFrequency Freq) {
+  return Printable([&BFI, Freq](raw_ostream &OS) {
+    printBlockFreqImpl(OS, BFI.getEntryFreq(), Freq);
+  });
+}
+
+Printable llvm::printBlockFreq(const BlockFrequencyInfo &BFI,
+                               const BasicBlock &BB) {
+  return printBlockFreq(BFI, BFI.getBlockFreq(&BB));
+}
+
 INITIALIZE_PASS_BEGIN(BlockFrequencyInfoWrapperPass, "block-freq",
                       "Block Frequency Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 82b1e3b9eede70..6f944990c78674 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -581,30 +581,27 @@ BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
       report_fatal_error(OS.str());
     }
 #endif
-    return 0;
+    return BlockFrequency(0);
   }
-  return Freqs[Node.Index].Integer;
+  return BlockFrequency(Freqs[Node.Index].Integer);
 }
 
 std::optional<uint64_t>
 BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F,
                                                  const BlockNode &Node,
                                                  bool AllowSynthetic) const {
-  return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency(),
-                                 AllowSynthetic);
+  return getProfileCountFromFreq(F, getBlockFreq(Node), AllowSynthetic);
 }
 
-std::optional<uint64_t>
-BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
-                                                    uint64_t Freq,
-                                                    bool AllowSynthetic) const {
+std::optional<uint64_t> BlockFrequencyInfoImplBase::getProfileCountFromFreq(
+    const Function &F, BlockFrequency Freq, bool AllowSynthetic) const {
   auto EntryCount = F.getEntryCount(AllowSynthetic);
   if (!EntryCount)
     return std::nullopt;
   // Use 128 bit APInt to do the arithmetic to avoid overflow.
   APInt BlockCount(128, EntryCount->getCount());
-  APInt BlockFreq(128, Freq);
-  APInt EntryFreq(128, getEntryFreq());
+  APInt BlockFreq(128, Freq.getFrequency());
+  APInt EntryFreq(128, getEntryFreq().getFrequency());
   BlockCount *= BlockFreq;
   // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned
   // lshr by 1 gives EntryFreq/2.
@@ -627,10 +624,10 @@ BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
 }
 
 void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node,
-                                              uint64_t Freq) {
+                                              BlockFrequency Freq) {
   assert(Node.isValid() && "Expected valid node");
   assert(Node.Index < Freqs.size() && "Expected legal index");
-  Freqs[Node.Index].Integer = Freq;
+  Freqs[Node.Index].Integer = Freq.getFrequency();
 }
 
 std::string
@@ -643,19 +640,19 @@ BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const {
   return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*");
 }
 
-raw_ostream &
-BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
-                                           const BlockNode &Node) const {
-  return OS << getFloatingBlockFreq(Node);
-}
-
-raw_ostream &
-BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
-                                           const BlockFrequency &Freq) const {
+void llvm::printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq,
+                              BlockFrequency Freq) {
+  if (Freq == BlockFrequency(0)) {
+    OS << "0";
+    return;
+  }
+  if (EntryFreq == BlockFrequency(0)) {
+    OS << "<invalid BFI>";
+    return;
+  }
   Scaled64 Block(Freq.getFrequency(), 0);
-  Scaled64 Entry(getEntryFreq(), 0);
-
-  return OS << Block / Entry;
+  Scaled64 Entry(EntryFreq.getFrequency(), 0);
+  OS << Block / Entry;
 }
 
 void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) {
diff --git a/llvm/lib/Analysis/CFGPrinter.cpp b/llvm/lib/Analysis/CFGPrinter.cpp
index f05dd6852d6dc9..9f55371f259b2c 100644
--- a/llvm/lib/Analysis/CFGPrinter.cpp
+++ b/llvm/lib/Analysis/CFGPrinter.cpp
@@ -318,10 +318,11 @@ bool DOTGraphTraits<DOTFuncInfo *>::isNodeHidden(const BasicBlock *Node,
                                                  const DOTFuncInfo *CFGInfo) {
   if (HideColdPaths.getNumOccurrences() > 0)
     if (auto *BFI = CFGInfo->getBFI()) {
-      uint64_t NodeFreq = BFI->getBlockFreq(Node).getFrequency();
-      uint64_t EntryFreq = BFI->getEntryFreq();
+      BlockFrequency NodeFreq = BFI->getBlockFreq(Node);
+      BlockFrequency EntryFreq = BFI->getEntryFreq();
       // Hide blocks with relative frequency below HideColdPaths threshold.
-      if ((double)NodeFreq / EntryFreq < HideColdPaths)
+      if ((double)NodeFreq.getFrequency() / EntryFreq.getFrequency() <
+          HideColdPaths)
         return true;
     }
   if (HideUnreachablePaths || HideDeoptimizePaths) {
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index fab73c20f3f2d8..2f164e80f24060 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1435,7 +1435,7 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
                                             /*IsSigned=*/false);
       }
     }
-    return ConstantExpr::getCast(Opcode, C, DestTy);
+    break;
   case Instruction::IntToPtr:
     // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
     // the int size is >= the ptr size and the address spaces are the same.
@@ -1454,8 +1454,7 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
         }
       }
     }
-
-    return ConstantExpr::getCast(Opcode, C, DestTy);
+    break;
   case Instruction::Trunc:
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -1466,10 +1465,14 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
   case Instruction::FPToUI:
   case Instruction::FPToSI:
   case Instruction::AddrSpaceCast:
-      return ConstantExpr::getCast(Opcode, C, DestTy);
+    break;
   case Instruction::BitCast:
     return FoldBitCast(C, DestTy, DL);
   }
+
+  if (ConstantExpr::isDesirableCastOp(Opcode))
+    return ConstantExpr::getCast(Opcode, C, DestTy);
+  return ConstantFoldCastInstruction(Opcode, C, DestTy);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 4fcf5575c74b0a..fa0c30637633df 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -88,10 +88,21 @@ static cl::opt<bool> InlineEnableCostBenefitAnalysis(
     "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
     cl::desc("Enable the cost-benefit analysis for the inliner"));
 
+// InlineSavingsMultiplier overrides per TTI multipliers iff it is
+// specified explicitly in command line options. This option is exposed
+// for tuning and testing.
 static cl::opt<int> InlineSavingsMultiplier(
     "inline-savings-multiplier", cl::Hidden, cl::init(8),
     cl::desc("Multiplier to multiply cycle savings by during inlining"));
 
+// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
+// specified explicitly in command line options. This option is exposed
+// for tuning and testing.
+static cl::opt<int> InlineSavingsProfitableMultiplier(
+    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
+    cl::desc("A multiplier on top of cycle savings to decide whether the "
+             "savings won't justify the cost"));
+
 static cl::opt<int>
     InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                         cl::desc("The maximum size of a callee that get's "
@@ -815,6 +826,32 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
     return true;
   }
 
+  // A helper function to choose between command line override and default.
+  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
+    if (InlineSavingsMultiplier.getNumOccurrences())
+      return InlineSavingsMultiplier;
+    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
+  }
+
+  // A helper function to choose between command line override and default.
+  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
+    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
+      return InlineSavingsProfitableMultiplier;
+    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
+  }
+
+  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
+    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
+            CandidateCall, "inline-cycle-savings-for-test")) {
+      CycleSavings = *AttrCycleSavings;
+    }
+
+    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
+            CandidateCall, "inline-runtime-cost-for-test")) {
+      Size = *AttrRuntimeCost;
+    }
+  }
+
   // Determine whether we should inline the given call site, taking into account
   // both the size cost and the cycle savings.  Return std::nullopt if we don't
   // have sufficient profiling information to determine.
@@ -884,29 +921,55 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
     CycleSavings += getCallsiteCost(this->CandidateCall, DL);
     CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
 
-    // Remove the cost of the cold basic blocks.
+    // Remove the cost of the cold basic blocks to model the runtime cost more
+    // accurately. Both machine block placement and function splitting could
+    // place cold blocks further from hot blocks.
     int Size = Cost - ColdSize;
 
     // Allow tiny callees to be inlined regardless of whether they meet the
     // savings threshold.
     Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
 
+    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
     CostBenefit.emplace(APInt(128, Size), CycleSavings);
 
-    // Return true if the savings justify the cost of inlining.  Specifically,
-    // we evaluate the following inequality:
+    // Let R be the ratio of CycleSavings to Size.  We accept the inlining
+    // opportunity if R is really high and reject if R is really low.  If R is
+    // somewhere in the middle, we fall back to the cost-based analysis.
     //
-    //  CycleSavings      PSI->getOrCompHotCountThreshold()
-    // -------------- >= -----------------------------------
-    //       Size              InlineSavingsMultiplier
+    // Specifically, let R = CycleSavings / Size, we accept the inlining
+    // opportunity if:
     //
-    // Note that the left hand side is specific to a call site.  The right hand
-    // side is a constant for the entire executable.
-    APInt LHS = CycleSavings;
-    LHS *= InlineSavingsMultiplier;
-    APInt RHS(128, PSI->getOrCompHotCountThreshold());
-    RHS *= Size;
-    return LHS.uge(RHS);
+    //             PSI->getOrCompHotCountThreshold()
+    // R > -------------------------------------------------
+    //     getInliningCostBenefitAnalysisSavingsMultiplier()
+    //
+    // and reject the inlining opportunity if:
+    //
+    //                PSI->getOrCompHotCountThreshold()
+    // R <= ----------------------------------------------------
+    //      getInliningCostBenefitAnalysisProfitableMultiplier()
+    //
+    // Otherwise, we fall back to the cost-based analysis.
+    //
+    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
+    // HotCountThreshold * Size) rather than division to avoid precision loss.
+    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
+    Threshold *= Size;
+
+    APInt UpperBoundCycleSavings = CycleSavings;
+    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
+    if (UpperBoundCycleSavings.uge(Threshold))
+      return true;
+
+    APInt LowerBoundCycleSavings = CycleSavings;
+    LowerBoundCycleSavings *=
+        getInliningCostBenefitAnalysisProfitableMultiplier();
+    if (LowerBoundCycleSavings.ult(Threshold))
+      return false;
+
+    // Otherwise, fall back to the cost-based analysis.
+    return std::nullopt;
   }
 
   InlineResult finalizeAnalysis() override {
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
index 0660a9993b6dd7..75eb8ece2e447e 100644
--- a/llvm/lib/Analysis/MLInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -192,7 +192,9 @@ void MLInlineAdvisor::onPassEntry(LazyCallGraph::SCC *LastSCC) {
   // - in addition, if new Nodes were created by a pass (e.g. CoroSplit),
   // they'd be adjacent to Nodes in the last SCC. So we just need to check the
   // boundary of Nodes in NodesInLastSCC for Nodes we haven't seen. We don't
-  // care about the nature of the Edge (call or ref).
+  // care about the nature of the Edge (call or ref). `FunctionLevels`-wise, we
+  // record them at the same level as the original node (this is a choice, may
+  // need revisiting).
   NodeCount -= static_cast<int64_t>(NodesInLastSCC.size());
   while (!NodesInLastSCC.empty()) {
     const auto *N = *NodesInLastSCC.begin();
@@ -204,12 +206,15 @@ void MLInlineAdvisor::onPassEntry(LazyCallGraph::SCC *LastSCC) {
     }
     ++NodeCount;
     EdgeCount += getLocalCalls(N->getFunction());
+    const auto NLevel = FunctionLevels.at(N);
     for (const auto &E : *(*N)) {
       const auto *AdjNode = &E.getNode();
       assert(!AdjNode->isDead() && !AdjNode->getFunction().isDeclaration());
       auto I = AllNodes.insert(AdjNode);
-      if (I.second)
+      if (I.second) {
         NodesInLastSCC.insert(AdjNode);
+        FunctionLevels[AdjNode] = NLevel;
+      }
     }
   }
 
@@ -461,6 +466,12 @@ void MLInlineAdvisor::print(raw_ostream &OS) const {
     OS << "\n";
   }
   OS << "\n";
+  OS << "[MLInlineAdvisor] FuncLevels:\n";
+  for (auto I : FunctionLevels)
+    OS << (I.first->isDead() ? "<deleted>" : I.first->getFunction().getName())
+       << " : " << I.second << "\n";
+
+  OS << "\n";
 }
 
 MLInlineAdvice::MLInlineAdvice(MLInlineAdvisor *Advisor, CallBase &CB,
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index a88622efa12db8..058a107691674c 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -413,7 +413,7 @@ static void computeFunctionSummary(
         // information.
         if (BFI != nullptr && Hotness == CalleeInfo::HotnessType::Unknown) {
           uint64_t BBFreq = BFI->getBlockFreq(&BB).getFrequency();
-          uint64_t EntryFreq = BFI->getEntryFreq();
+          uint64_t EntryFreq = BFI->getEntryFreq().getFrequency();
           ValueInfo.updateRelBlockFreq(BBFreq, EntryFreq);
         }
       } else {
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 15ba6468a30708..7b38583e9bc75f 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
@@ -44,6 +45,14 @@ StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] =
 #include "llvm/Analysis/TargetLibraryInfo.def"
 };
 
+std::string VecDesc::getVectorFunctionABIVariantString() const {
+  assert(!VectorFnName.empty() && "Vector function name must not be empty.");
+  SmallString<256> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  Out << VABIPrefix << "_" << ScalarFnName << "(" << VectorFnName << ")";
+  return std::string(Out.str());
+}
+
 // Recognized types of library function arguments and return types.
 enum FuncArgTypeID : char {
   Void = 0, // Must be zero.
@@ -1138,15 +1147,15 @@ void TargetLibraryInfoImpl::disableAllFunctions() {
 }
 
 static bool compareByScalarFnName(const VecDesc &LHS, const VecDesc &RHS) {
-  return LHS.ScalarFnName < RHS.ScalarFnName;
+  return LHS.getScalarFnName() < RHS.getScalarFnName();
 }
 
 static bool compareByVectorFnName(const VecDesc &LHS, const VecDesc &RHS) {
-  return LHS.VectorFnName < RHS.VectorFnName;
+  return LHS.getVectorFnName() < RHS.getVectorFnName();
 }
 
 static bool compareWithScalarFnName(const VecDesc &LHS, StringRef S) {
-  return LHS.ScalarFnName < S;
+  return LHS.getScalarFnName() < S;
 }
 
 void TargetLibraryInfoImpl::addVectorizableFunctions(ArrayRef<VecDesc> Fns) {
@@ -1203,17 +1212,20 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
   case SLEEFGNUABI: {
     const VecDesc VecFuncs_VF2[] = {
 #define TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF, /* MASK = */ false},
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX)                         \
+  {SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX},
 #include "llvm/Analysis/VecFuncs.def"
     };
     const VecDesc VecFuncs_VF4[] = {
 #define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF, /* MASK = */ false},
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX)                         \
+  {SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX},
 #include "llvm/Analysis/VecFuncs.def"
     };
     const VecDesc VecFuncs_VFScalable[] = {
 #define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK) {SCAL, VEC, VF, MASK},
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX)                   \
+  {SCAL, VEC, VF, MASK, VABI_PREFIX},
 #include "llvm/Analysis/VecFuncs.def"
     };
 
@@ -1232,7 +1244,8 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
   case ArmPL: {
     const VecDesc VecFuncs[] = {
 #define TLI_DEFINE_ARMPL_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK) {SCAL, VEC, VF, MASK},
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX)                   \
+  {SCAL, VEC, VF, MASK, VABI_PREFIX},
 #include "llvm/Analysis/VecFuncs.def"
     };
 
@@ -1258,23 +1271,32 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const {
 
   std::vector<VecDesc>::const_iterator I =
       llvm::lower_bound(VectorDescs, funcName, compareWithScalarFnName);
-  return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
+  return I != VectorDescs.end() && StringRef(I->getScalarFnName()) == funcName;
 }
 
 StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
                                                        const ElementCount &VF,
                                                        bool Masked) const {
+  const VecDesc *VD = getVectorMappingInfo(F, VF, Masked);
+  if (VD)
+    return VD->getVectorFnName();
+  return StringRef();
+}
+
+const VecDesc *
+TargetLibraryInfoImpl::getVectorMappingInfo(StringRef F, const ElementCount &VF,
+                                            bool Masked) const {
   F = sanitizeFunctionName(F);
   if (F.empty())
-    return F;
+    return nullptr;
   std::vector<VecDesc>::const_iterator I =
       llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
-  while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
-    if ((I->VectorizationFactor == VF) && (I->Masked == Masked))
-      return I->VectorFnName;
+  while (I != VectorDescs.end() && StringRef(I->getScalarFnName()) == F) {
+    if ((I->getVectorizationFactor() == VF) && (I->isMasked() == Masked))
+      return &(*I);
     ++I;
   }
-  return StringRef();
+  return nullptr;
 }
 
 TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F,
@@ -1346,11 +1368,11 @@ void TargetLibraryInfoImpl::getWidestVF(StringRef ScalarF,
 
   std::vector<VecDesc>::const_iterator I =
       llvm::lower_bound(VectorDescs, ScalarF, compareWithScalarFnName);
-  while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == ScalarF) {
+  while (I != VectorDescs.end() && StringRef(I->getScalarFnName()) == ScalarF) {
     ElementCount *VF =
-        I->VectorizationFactor.isScalable() ? &ScalableVF : &FixedVF;
-    if (ElementCount::isKnownGT(I->VectorizationFactor, *VF))
-      *VF = I->VectorizationFactor;
+        I->getVectorizationFactor().isScalable() ? &ScalableVF : &FixedVF;
+    if (ElementCount::isKnownGT(I->getVectorizationFactor(), *VF))
+      *VF = I->getVectorizationFactor();
     ++I;
   }
 }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1..aad14f21d11461 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -212,6 +212,17 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
   return TTIImpl->getInliningThresholdMultiplier();
 }
 
+unsigned
+TargetTransformInfo::getInliningCostBenefitAnalysisSavingsMultiplier() const {
+  return TTIImpl->getInliningCostBenefitAnalysisSavingsMultiplier();
+}
+
+unsigned
+TargetTransformInfo::getInliningCostBenefitAnalysisProfitableMultiplier()
+    const {
+  return TTIImpl->getInliningCostBenefitAnalysisProfitableMultiplier();
+}
+
 unsigned
 TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const {
   return TTIImpl->adjustInliningThreshold(CB);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 0736ef65b30651..3af5a6d9a167de 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -163,9 +163,9 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known,
                             const DataLayout &DL, unsigned Depth,
                             AssumptionCache *AC, const Instruction *CxtI,
                             const DominatorTree *DT, bool UseInstrInfo) {
-  ::computeKnownBits(V, Known, Depth,
-                     SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                   safeCxtI(V, CxtI), UseInstrInfo));
+  ::computeKnownBits(
+      V, Known, Depth,
+      SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 void llvm::computeKnownBits(const Value *V, const APInt &DemandedElts,
@@ -173,9 +173,9 @@ void llvm::computeKnownBits(const Value *V, const APInt &DemandedElts,
                             unsigned Depth, AssumptionCache *AC,
                             const Instruction *CxtI, const DominatorTree *DT,
                             bool UseInstrInfo) {
-  ::computeKnownBits(V, DemandedElts, Known, Depth,
-                     SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                   safeCxtI(V, CxtI), UseInstrInfo));
+  ::computeKnownBits(
+      V, DemandedElts, Known, Depth,
+      SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 static KnownBits computeKnownBits(const Value *V, const APInt &DemandedElts,
@@ -188,18 +188,17 @@ KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
                                  unsigned Depth, AssumptionCache *AC,
                                  const Instruction *CxtI,
                                  const DominatorTree *DT, bool UseInstrInfo) {
-  return ::computeKnownBits(V, Depth,
-                            SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                          safeCxtI(V, CxtI), UseInstrInfo));
+  return ::computeKnownBits(
+      V, Depth, SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 KnownBits llvm::computeKnownBits(const Value *V, const APInt &DemandedElts,
                                  const DataLayout &DL, unsigned Depth,
                                  AssumptionCache *AC, const Instruction *CxtI,
                                  const DominatorTree *DT, bool UseInstrInfo) {
-  return ::computeKnownBits(V, DemandedElts, Depth,
-                            SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                          safeCxtI(V, CxtI), UseInstrInfo));
+  return ::computeKnownBits(
+      V, DemandedElts, Depth,
+      SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
@@ -275,10 +274,9 @@ bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
                                   bool OrZero, unsigned Depth,
                                   AssumptionCache *AC, const Instruction *CxtI,
                                   const DominatorTree *DT, bool UseInstrInfo) {
-  return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
-                                  SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                                safeCxtI(V, CxtI),
-                                                UseInstrInfo));
+  return ::isKnownToBeAPowerOfTwo(
+      V, OrZero, Depth,
+      SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 static bool isKnownNonZero(const Value *V, const APInt &DemandedElts,
@@ -290,9 +288,8 @@ static bool isKnownNonZero(const Value *V, unsigned Depth,
 bool llvm::isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth,
                           AssumptionCache *AC, const Instruction *CxtI,
                           const DominatorTree *DT, bool UseInstrInfo) {
-  return ::isKnownNonZero(V, Depth,
-                          SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                        safeCxtI(V, CxtI), UseInstrInfo));
+  return ::isKnownNonZero(
+      V, Depth, SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
@@ -329,9 +326,9 @@ bool llvm::isKnownNonEqual(const Value *V1, const Value *V2,
                            const DataLayout &DL, AssumptionCache *AC,
                            const Instruction *CxtI, const DominatorTree *DT,
                            bool UseInstrInfo) {
-  return ::isKnownNonEqual(V1, V2, 0,
-                           SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                         safeCxtI(V2, V1, CxtI), UseInstrInfo));
+  return ::isKnownNonEqual(
+      V1, V2, 0,
+      SimplifyQuery(DL, DT, AC, safeCxtI(V2, V1, CxtI), UseInstrInfo));
 }
 
 static bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
@@ -341,9 +338,9 @@ bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask,
                              const DataLayout &DL, unsigned Depth,
                              AssumptionCache *AC, const Instruction *CxtI,
                              const DominatorTree *DT, bool UseInstrInfo) {
-  return ::MaskedValueIsZero(V, Mask, Depth,
-                             SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                           safeCxtI(V, CxtI), UseInstrInfo));
+  return ::MaskedValueIsZero(
+      V, Mask, Depth,
+      SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 static unsigned ComputeNumSignBits(const Value *V, const APInt &DemandedElts,
@@ -361,9 +358,8 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
                                   unsigned Depth, AssumptionCache *AC,
                                   const Instruction *CxtI,
                                   const DominatorTree *DT, bool UseInstrInfo) {
-  return ::ComputeNumSignBits(V, Depth,
-                              SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                            safeCxtI(V, CxtI), UseInstrInfo));
+  return ::ComputeNumSignBits(
+      V, Depth, SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
 unsigned llvm::ComputeMaxSignificantBits(const Value *V, const DataLayout &DL,
@@ -956,10 +952,9 @@ KnownBits llvm::analyzeKnownBitsFromAndXorOr(
   APInt DemandedElts =
       FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1);
 
-  return getKnownBitsFromAndXorOr(I, DemandedElts, KnownLHS, KnownRHS, Depth,
-                                  SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC,
-                                                safeCxtI(I, CxtI),
-                                                UseInstrInfo));
+  return getKnownBitsFromAndXorOr(
+      I, DemandedElts, KnownLHS, KnownRHS, Depth,
+      SimplifyQuery(DL, DT, AC, safeCxtI(I, CxtI), UseInstrInfo));
 }
 
 ConstantRange llvm::getVScaleRange(const Function *F, unsigned BitWidth) {
@@ -4008,7 +4003,7 @@ std::pair<Value *, FPClassTest> llvm::fcmpToClassTest(FCmpInst::Predicate Pred,
                                                       bool LookThroughSrc) {
   const APFloat *ConstRHS;
   if (!match(RHS, m_APFloatAllowUndef(ConstRHS)))
-    return {nullptr, fcNone};
+    return {nullptr, fcAllFlags};
 
   return fcmpToClassTest(Pred, F, LHS, ConstRHS, LookThroughSrc);
 }
@@ -4030,7 +4025,7 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS,
     // TODO: Handle DAZ by expanding masks to cover subnormal cases.
     if (Pred != FCmpInst::FCMP_ORD && Pred != FCmpInst::FCMP_UNO &&
         !inputDenormalIsIEEE(F, LHS->getType()))
-      return {nullptr, fcNone};
+      return {nullptr, fcAllFlags};
 
     switch (Pred) {
     case FCmpInst::FCMP_OEQ: // Match x == 0.0
@@ -4067,7 +4062,7 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS,
       break;
     }
 
-    return {nullptr, fcNone};
+    return {nullptr, fcAllFlags};
   }
 
   Value *Src = LHS;
@@ -4151,7 +4146,7 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS,
     case FCmpInst::FCMP_OGE:
     case FCmpInst::FCMP_ULT: {
       if (ConstRHS->isNegative()) // TODO
-        return {nullptr, fcNone};
+        return {nullptr, fcAllFlags};
 
       // fcmp oge fabs(x), +inf -> fcInf
       // fcmp oge x, +inf -> fcPosInf
@@ -4165,14 +4160,14 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS,
     case FCmpInst::FCMP_OGT:
     case FCmpInst::FCMP_ULE: {
       if (ConstRHS->isNegative())
-        return {nullptr, fcNone};
+        return {nullptr, fcAllFlags};
 
       // No value is ordered and greater than infinity.
       Mask = fcNone;
       break;
     }
     default:
-      return {nullptr, fcNone};
+      return {nullptr, fcAllFlags};
     }
   } else if (ConstRHS->isSmallestNormalized() && !ConstRHS->isNegative()) {
     // Match pattern that's used in __builtin_isnormal.
@@ -4201,14 +4196,14 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS,
       break;
     }
     default:
-      return {nullptr, fcNone};
+      return {nullptr, fcAllFlags};
     }
   } else if (ConstRHS->isNaN()) {
     // fcmp o__ x, nan -> false
     // fcmp u__ x, nan -> true
     Mask = fcNone;
   } else
-    return {nullptr, fcNone};
+    return {nullptr, fcAllFlags};
 
   // Invert the comparison for the unordered cases.
   if (FCmpInst::isUnordered(Pred))
@@ -6383,9 +6378,8 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
       (LHSRange.isAllNegative() || RHSRange.isAllNegative());
   if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
     KnownBits AddKnown(LHSRange.getBitWidth());
-    computeKnownBitsFromAssume(
-        Add, AddKnown, /*Depth=*/0,
-        SimplifyQuery(DL, /*TLI*/ nullptr, DT, AC, CxtI, DT));
+    computeKnownBitsFromAssume(Add, AddKnown, /*Depth=*/0,
+                               SimplifyQuery(DL, DT, AC, CxtI, DT));
     if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
         (AddKnown.isNegative() && LHSOrRHSKnownNegative))
       return OverflowResult::NeverOverflows;
@@ -8758,56 +8752,50 @@ static ConstantRange getRangeForIntrinsic(const IntrinsicInst &II) {
   return ConstantRange::getFull(Width);
 }
 
-static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower,
-                                      APInt &Upper, const InstrInfoQuery &IIQ) {
+static ConstantRange getRangeForSelectPattern(const SelectInst &SI,
+                                              const InstrInfoQuery &IIQ) {
+  unsigned BitWidth = SI.getType()->getScalarSizeInBits();
   const Value *LHS = nullptr, *RHS = nullptr;
   SelectPatternResult R = matchSelectPattern(&SI, LHS, RHS);
   if (R.Flavor == SPF_UNKNOWN)
-    return;
-
-  unsigned BitWidth = SI.getType()->getScalarSizeInBits();
+    return ConstantRange::getFull(BitWidth);
 
   if (R.Flavor == SelectPatternFlavor::SPF_ABS) {
     // If the negation part of the abs (in RHS) has the NSW flag,
     // then the result of abs(X) is [0..SIGNED_MAX],
     // otherwise it is [0..SIGNED_MIN], as -SIGNED_MIN == SIGNED_MIN.
-    Lower = APInt::getZero(BitWidth);
     if (match(RHS, m_Neg(m_Specific(LHS))) &&
         IIQ.hasNoSignedWrap(cast<Instruction>(RHS)))
-      Upper = APInt::getSignedMaxValue(BitWidth) + 1;
-    else
-      Upper = APInt::getSignedMinValue(BitWidth) + 1;
-    return;
+      return ConstantRange::getNonEmpty(APInt::getZero(BitWidth),
+                                        APInt::getSignedMaxValue(BitWidth) + 1);
+
+    return ConstantRange::getNonEmpty(APInt::getZero(BitWidth),
+                                      APInt::getSignedMinValue(BitWidth) + 1);
   }
 
   if (R.Flavor == SelectPatternFlavor::SPF_NABS) {
     // The result of -abs(X) is <= 0.
-    Lower = APInt::getSignedMinValue(BitWidth);
-    Upper = APInt(BitWidth, 1);
-    return;
+    return ConstantRange::getNonEmpty(APInt::getSignedMinValue(BitWidth),
+                                      APInt(BitWidth, 1));
   }
 
   const APInt *C;
   if (!match(LHS, m_APInt(C)) && !match(RHS, m_APInt(C)))
-    return;
+    return ConstantRange::getFull(BitWidth);
 
   switch (R.Flavor) {
-    case SPF_UMIN:
-      Upper = *C + 1;
-      break;
-    case SPF_UMAX:
-      Lower = *C;
-      break;
-    case SPF_SMIN:
-      Lower = APInt::getSignedMinValue(BitWidth);
-      Upper = *C + 1;
-      break;
-    case SPF_SMAX:
-      Lower = *C;
-      Upper = APInt::getSignedMaxValue(BitWidth) + 1;
-      break;
-    default:
-      break;
+  case SPF_UMIN:
+    return ConstantRange::getNonEmpty(APInt::getZero(BitWidth), *C + 1);
+  case SPF_UMAX:
+    return ConstantRange::getNonEmpty(*C, APInt::getZero(BitWidth));
+  case SPF_SMIN:
+    return ConstantRange::getNonEmpty(APInt::getSignedMinValue(BitWidth),
+                                      *C + 1);
+  case SPF_SMAX:
+    return ConstantRange::getNonEmpty(*C,
+                                      APInt::getSignedMaxValue(BitWidth) + 1);
+  default:
+    return ConstantRange::getFull(BitWidth);
   }
 }
 
@@ -8854,11 +8842,12 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned,
   } else if (auto *II = dyn_cast<IntrinsicInst>(V))
     CR = getRangeForIntrinsic(*II);
   else if (auto *SI = dyn_cast<SelectInst>(V)) {
-    APInt Lower = APInt(BitWidth, 0);
-    APInt Upper = APInt(BitWidth, 0);
-    // TODO: Return ConstantRange.
-    setLimitsForSelectPattern(*SI, Lower, Upper, IIQ);
-    CR = ConstantRange::getNonEmpty(Lower, Upper);
+    ConstantRange CRTrue = computeConstantRange(
+        SI->getTrueValue(), ForSigned, UseInstrInfo, AC, CtxI, DT, Depth + 1);
+    ConstantRange CRFalse = computeConstantRange(
+        SI->getFalseValue(), ForSigned, UseInstrInfo, AC, CtxI, DT, Depth + 1);
+    CR = CRTrue.unionWith(CRFalse);
+    CR = CR.intersectWith(getRangeForSelectPattern(*SI, IIQ));
   } else if (isa<FPToUIInst>(V) || isa<FPToSIInst>(V)) {
     APInt Lower = APInt(BitWidth, 0);
     APInt Upper = APInt(BitWidth, 0);
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 13bb4e83a5b94d..9893e23468e177 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1453,22 +1453,6 @@ void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
 }
 }
 
-std::string VFABI::mangleTLIVectorName(StringRef VectorName,
-                                       StringRef ScalarName, unsigned numArgs,
-                                       ElementCount VF, bool Masked) {
-  SmallString<256> Buffer;
-  llvm::raw_svector_ostream Out(Buffer);
-  Out << "_ZGV" << VFABI::_LLVM_ << (Masked ? "M" : "N");
-  if (VF.isScalable())
-    Out << 'x';
-  else
-    Out << VF.getFixedValue();
-  for (unsigned I = 0; I < numArgs; ++I)
-    Out << "v";
-  Out << "_" << ScalarName << "(" << VectorName << ")";
-  return std::string(Out.str());
-}
-
 void VFABI::getVectorVariantNames(
     const CallInst &CI, SmallVectorImpl<std::string> &VariantMappings) {
   const StringRef S = CI.getFnAttr(VFABI::MappingsAttrName).getValueAsString();
diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp
index f6613f16fafef6..9c0e657b069697 100644
--- a/llvm/lib/BinaryFormat/DXContainer.cpp
+++ b/llvm/lib/BinaryFormat/DXContainer.cpp
@@ -30,6 +30,36 @@ bool ShaderHash::isPopulated() {
   return Flags > 0 || 0 != memcmp(&Digest, &Zeros, 16);
 }
 
+#define COMPONENT_PRECISION(Val, Enum) {#Enum, SigMinPrecision::Enum},
+
+static const EnumEntry<SigMinPrecision> SigMinPrecisionNames[] = {
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+};
+
+ArrayRef<EnumEntry<SigMinPrecision>> dxbc::getSigMinPrecisions() {
+  return ArrayRef(SigMinPrecisionNames);
+}
+
+#define D3D_SYSTEM_VALUE(Val, Enum) {#Enum, D3DSystemValue::Enum},
+
+static const EnumEntry<D3DSystemValue> D3DSystemValueNames[] = {
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+};
+
+ArrayRef<EnumEntry<D3DSystemValue>> dxbc::getD3DSystemValues() {
+  return ArrayRef(D3DSystemValueNames);
+}
+
+#define COMPONENT_TYPE(Val, Enum) {#Enum, SigComponentType::Enum},
+
+static const EnumEntry<SigComponentType> SigComponentTypes[] = {
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+};
+
+ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() {
+  return ArrayRef(SigComponentTypes);
+}
+
 #define SEMANTIC_KIND(Val, Enum) {#Enum, PSV::SemanticKind::Enum},
 
 static const EnumEntry<PSV::SemanticKind> SemanticKindNames[] = {
diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp
index 025334f9f3f42b..420224df57df41 100644
--- a/llvm/lib/BinaryFormat/Magic.cpp
+++ b/llvm/lib/BinaryFormat/Magic.cpp
@@ -87,6 +87,10 @@ file_magic llvm::identify_magic(StringRef Magic) {
     if (startswith(Magic, "BC\xC0\xDE"))
       return file_magic::bitcode;
     break;
+  case 'C':
+    if (startswith(Magic, "CCOB"))
+      return file_magic::offload_bundle_compressed;
+    break;
   case '!':
     if (startswith(Magic, "!<arch>\n") || startswith(Magic, "!<thin>\n"))
       return file_magic::archive;
@@ -251,6 +255,13 @@ file_magic llvm::identify_magic(StringRef Magic) {
       return file_magic::coff_object;
     break;
 
+  case '_': {
+    const char OBMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
+    if (Magic.size() >= sizeof(OBMagic) && startswith(Magic, OBMagic))
+      return file_magic::offload_bundle;
+    break;
+  }
+
   default:
     break;
   }
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index e56291859022ee..1d1ec988a93d84 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4683,7 +4683,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       case bitc::METADATA_BLOCK_ID:
         assert(DeferredMetadataInfo.empty() &&
                "Must read all module-level metadata before function-level");
-        if (Error Err = MDLoader->parseFunctionMetadata(CurBB))
+        if (Error Err = MDLoader->parseFunctionMetadata())
           return Err;
         break;
       case bitc::USELIST_BLOCK_ID:
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 74ccb2dd8f3261..1e9ed5fcaa581b 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -473,8 +473,7 @@ class MetadataLoader::MetadataLoaderImpl {
 
   Error parseOneMetadata(SmallVectorImpl<uint64_t> &Record, unsigned Code,
                          PlaceholderQueue &Placeholders, StringRef Blob,
-                         unsigned &NextMetadataNo,
-                         BasicBlock *ConstExprInsertBB);
+                         unsigned &NextMetadataNo);
   Error parseMetadataStrings(ArrayRef<uint64_t> Record, StringRef Blob,
                              function_ref<void(StringRef)> CallBack);
   Error parseGlobalObjectAttachment(GlobalObject &GO,
@@ -723,7 +722,7 @@ class MetadataLoader::MetadataLoaderImpl {
         TheModule(TheModule), Callbacks(std::move(Callbacks)),
         IsImporting(IsImporting) {}
 
-  Error parseMetadata(bool ModuleLevel, BasicBlock *ConstExprInsertBB);
+  Error parseMetadata(bool ModuleLevel);
 
   bool hasFwdRefs() const { return MetadataList.hasFwdRefs(); }
 
@@ -1048,8 +1047,7 @@ void MetadataLoader::MetadataLoaderImpl::callMDTypeCallback(Metadata **Val,
 
 /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing
 /// module level metadata.
-Error MetadataLoader::MetadataLoaderImpl::parseMetadata(
-    bool ModuleLevel, BasicBlock *ConstExprInsertBB) {
+Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
   if (!ModuleLevel && MetadataList.hasFwdRefs())
     return error("Invalid metadata: fwd refs into function blocks");
 
@@ -1132,7 +1130,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(
     if (Expected<unsigned> MaybeCode =
             Stream.readRecord(Entry.ID, Record, &Blob)) {
       if (Error Err = parseOneMetadata(Record, MaybeCode.get(), Placeholders,
-                                       Blob, NextMetadataNo, ConstExprInsertBB))
+                                       Blob, NextMetadataNo))
         return Err;
     } else
       return MaybeCode.takeError();
@@ -1173,8 +1171,7 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
   if (Expected<unsigned> MaybeCode =
           IndexCursor.readRecord(Entry.ID, Record, &Blob)) {
     if (Error Err =
-            parseOneMetadata(Record, MaybeCode.get(), Placeholders, Blob, ID,
-                             /* ConstExprInsertBB */ nullptr))
+            parseOneMetadata(Record, MaybeCode.get(), Placeholders, Blob, ID))
       report_fatal_error("Can't lazyload MD, parseOneMetadata: " +
                          Twine(toString(std::move(Err))));
   } else
@@ -1216,10 +1213,29 @@ void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders(
   Placeholders.flush(MetadataList);
 }
 
+static Value *getValueFwdRef(BitcodeReaderValueList &ValueList, unsigned Idx,
+                             Type *Ty, unsigned TyID) {
+  Value *V = ValueList.getValueFwdRef(Idx, Ty, TyID,
+                                      /*ConstExprInsertBB*/ nullptr);
+  if (V)
+    return V;
+
+  // This is a reference to a no longer supported constant expression.
+  // Pretend that the constant was deleted, which will replace metadata
+  // references with undef.
+  // TODO: This is a rather indirect check. It would be more elegant to use
+  // a separate ErrorInfo for constant materialization failure and thread
+  // the error reporting through getValueFwdRef().
+  if (Idx < ValueList.size() && ValueList[Idx] &&
+      ValueList[Idx]->getType() == Ty)
+    return UndefValue::get(Ty);
+
+  return nullptr;
+}
+
 Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     SmallVectorImpl<uint64_t> &Record, unsigned Code,
-    PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo,
-    BasicBlock *ConstExprInsertBB) {
+    PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo) {
 
   bool IsDistinct = false;
   auto getMD = [&](unsigned ID) -> Metadata * {
@@ -1348,8 +1364,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       if (Ty->isMetadataTy())
         Elts.push_back(getMD(Record[i + 1]));
       else if (!Ty->isVoidTy()) {
-        Value *V = ValueList.getValueFwdRef(Record[i + 1], Ty, TyID,
-                                            /*ConstExprInsertBB*/ nullptr);
+        Value *V = getValueFwdRef(ValueList, Record[i + 1], Ty, TyID);
         if (!V)
           return error("Invalid value reference from old metadata");
         Metadata *MD = ValueAsMetadata::get(V);
@@ -1373,7 +1388,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     if (!Ty || Ty->isMetadataTy() || Ty->isVoidTy())
       return error("Invalid record");
 
-    Value *V = ValueList.getValueFwdRef(Record[1], Ty, TyID, ConstExprInsertBB);
+    Value *V = getValueFwdRef(ValueList, Record[1], Ty, TyID);
     if (!V)
       return error("Invalid value reference from metadata");
 
@@ -2462,9 +2477,8 @@ MetadataLoader::MetadataLoader(BitstreamCursor &Stream, Module &TheModule,
     : Pimpl(std::make_unique<MetadataLoaderImpl>(
           Stream, TheModule, ValueList, std::move(Callbacks), IsImporting)) {}
 
-Error MetadataLoader::parseMetadata(bool ModuleLevel,
-                                    BasicBlock *ConstExprInsertBB) {
-  return Pimpl->parseMetadata(ModuleLevel, ConstExprInsertBB);
+Error MetadataLoader::parseMetadata(bool ModuleLevel) {
+  return Pimpl->parseMetadata(ModuleLevel);
 }
 
 bool MetadataLoader::hasFwdRefs() const { return Pimpl->hasFwdRefs(); }
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.h b/llvm/lib/Bitcode/Reader/MetadataLoader.h
index 3bbe51b2f70b25..bab855ca635910 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.h
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.h
@@ -48,8 +48,7 @@ struct MetadataLoaderCallbacks {
 class MetadataLoader {
   class MetadataLoaderImpl;
   std::unique_ptr<MetadataLoaderImpl> Pimpl;
-  Error parseMetadata(bool ModuleLevel,
-                      BasicBlock *ConstExprInsertBB = nullptr);
+  Error parseMetadata(bool ModuleLevel);
 
 public:
   ~MetadataLoader();
@@ -63,9 +62,7 @@ class MetadataLoader {
   Error parseModuleMetadata() { return parseMetadata(true); }
 
   // Parse a function metadata block
-  Error parseFunctionMetadata(BasicBlock *ConstExprInsertBB) {
-    return parseMetadata(false, ConstExprInsertBB);
-  }
+  Error parseFunctionMetadata() { return parseMetadata(false); }
 
   /// Set the mode to strip TBAA metadata on load.
   void setStripTBAA(bool StripTBAA = true);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e31b08df7dbbe8..371f6598e6b2b3 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2591,9 +2591,8 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
     (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
     assert(!VerifyBFIUpdates ||
            BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
-    BFI->setBlockFreq(
-        BB,
-        (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
+    BFI->setBlockFreq(BB,
+                      (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)));
     ModifiedDT = ModifyDT::ModifyBBDT;
     Changed = true;
     ++NumRetsDup;
@@ -7067,7 +7066,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
     FreshBBs.insert(EndBlock);
   }
 
-  BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+  BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
 
   static const unsigned MD[] = {
       LLVMContext::MD_prof, LLVMContext::MD_unpredictable,
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 1eba979f83db0c..c6d7827f36dfd6 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -18,8 +18,10 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/TargetParser/Triple.h"
@@ -732,3 +734,24 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
   for (Function &F : M)
     setFunctionAttributes(CPU, Features, F);
 }
+
+Expected<std::unique_ptr<TargetMachine>>
+codegen::createTargetMachineForTriple(StringRef TargetTriple,
+                                      CodeGenOptLevel OptLevel) {
+  Triple TheTriple(TargetTriple);
+  std::string Error;
+  const auto *TheTarget =
+      TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error);
+  if (!TheTarget)
+    return createStringError(inconvertibleErrorCode(), Error);
+  auto *Target = TheTarget->createTargetMachine(
+      TheTriple.getTriple(), codegen::getCPUStr(), codegen::getFeaturesStr(),
+      codegen::InitTargetOptionsFromCodeGenFlags(TheTriple),
+      codegen::getExplicitRelocModel(), codegen::getExplicitCodeModel(),
+      OptLevel);
+  if (!Target)
+    return createStringError(inconvertibleErrorCode(),
+                             Twine("could not allocate target machine for ") +
+                                 TargetTriple);
+  return std::unique_ptr<TargetMachine>(Target);
+}
diff --git a/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp b/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp
index e261531eb01405..598be26e40c8e9 100644
--- a/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp
+++ b/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp
@@ -58,7 +58,7 @@ bool GCEmptyBasicBlocks::runOnMachineFunction(MachineFunction &MF) {
     // TODO If a block is an eh pad, or it has address taken, we don't remove
     // it. Removing such blocks is possible, but it probably requires a more
     // complex logic.
-    if (MBB->isEHPad() || MBB->isMachineBlockAddressTaken())
+    if (MBB->isEHPad() || MBB->hasAddressTaken())
       continue;
     // Skip blocks with real code.
     bool HasAnyRealCode = llvm::any_of(*MBB, [](const MachineInstr &MI) {
diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index d201342cd61dbc..bb5363fb2527b5 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -449,7 +449,8 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
     return MappingCost::ImpossibleCost();
 
   // If mapped with InstrMapping, MI will have the recorded cost.
-  MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent()) : 1);
+  MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent())
+                        : BlockFrequency(1));
   bool Saturated = Cost.addLocalCost(InstrMapping.getCost());
   assert(!Saturated && "Possible mapping saturated the cost");
   LLVM_DEBUG(dbgs() << "Evaluating mapping cost for: " << MI);
@@ -971,7 +972,7 @@ bool RegBankSelect::EdgeInsertPoint::canMaterialize() const {
   return Src.canSplitCriticalEdge(DstOrSplit);
 }
 
-RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq)
+RegBankSelect::MappingCost::MappingCost(BlockFrequency LocalFreq)
     : LocalFreq(LocalFreq.getFrequency()) {}
 
 bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) {
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 1636a225d672a9..c62f3db9d32156 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -495,31 +495,6 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   return true;
 }
 
-/// Check if all subranges in \p LI and \p SLI have the same value number at \p
-/// Idx.
-static bool allSubRangeValNoSame(const LiveInterval &LI,
-                                 const LiveInterval &SLI,
-                                 const MachineInstr &MI,
-                                 const MachineRegisterInfo &MRI,
-                                 const TargetRegisterInfo &TRI, SlotIndex Idx) {
-  for (auto &SR : SLI.subranges()) {
-    VNInfo *SubVNI = SR.getVNInfoAt(Idx);
-
-    for (auto &SubLI : LI.subranges()) {
-      if (SubLI.LaneMask == SR.LaneMask) {
-        if (SubVNI != SubLI.getVNInfoAt(Idx))
-          return false;
-      } else if ((SubLI.LaneMask & SR.LaneMask).any()) {
-        // TODO: Check non-exact, overlapping subranges if they share the same
-        // def instruction
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
 /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
 /// redundant spills of this value in SLI.reg and sibling copies.
 void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
@@ -549,13 +524,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
       if (!MI.mayStore() && !TII.isCopyInstr(MI))
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
-
-      // The main range value numbers will differ if multiple instructions are
-      // used to define its various subregisters. Check the subregister value
-      // numbers as a fallback.
-      if (LI->getVNInfoAt(Idx) != VNI &&
-          (!SLI.hasSubRanges() ||
-           !allSubRangeValNoSame(*LI, SLI, MI, MRI, TRI, Idx)))
+      if (LI->getVNInfoAt(Idx) != VNI)
         continue;
 
       // Follow sibling copies down the dominator tree.
diff --git a/llvm/lib/CodeGen/MBFIWrapper.cpp b/llvm/lib/CodeGen/MBFIWrapper.cpp
index 5b388be2783946..e9f16329b57f75 100644
--- a/llvm/lib/CodeGen/MBFIWrapper.cpp
+++ b/llvm/lib/CodeGen/MBFIWrapper.cpp
@@ -38,25 +38,13 @@ MBFIWrapper::getBlockProfileCount(const MachineBasicBlock *MBB) const {
   // Modified block frequency also impacts profile count. So we should compute
   // profile count from new block frequency if it has been changed.
   if (I != MergedBBFreq.end())
-    return MBFI.getProfileCountFromFreq(I->second.getFrequency());
+    return MBFI.getProfileCountFromFreq(I->second);
 
   return MBFI.getBlockProfileCount(MBB);
 }
 
-raw_ostream & MBFIWrapper::printBlockFreq(raw_ostream &OS,
-                                          const MachineBasicBlock *MBB) const {
-  return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
-}
-
-raw_ostream & MBFIWrapper::printBlockFreq(raw_ostream &OS,
-                                          const BlockFrequency Freq) const {
-  return MBFI.printBlockFreq(OS, Freq);
-}
-
 void MBFIWrapper::view(const Twine &Name, bool isSimple) {
   MBFI.view(Name, isSimple);
 }
 
-uint64_t MBFIWrapper::getEntryFreq() const {
-  return MBFI.getEntryFreq();
-}
+BlockFrequency MBFIWrapper::getEntryFreq() const { return MBFI.getEntryFreq(); }
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index b1cbe525d7e6c1..7ee72e21442630 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -75,7 +75,7 @@ static cl::opt<bool> PrintMachineBlockFreq(
 
 // Command line option to specify the name of the function for block frequency
 // dump. Defined in Analysis/BlockFrequencyInfo.cpp.
-extern cl::opt<std::string> PrintBlockFreqFuncName;
+extern cl::opt<std::string> PrintBFIFuncName;
 } // namespace llvm
 
 static GVDAGType getGVDT() {
@@ -203,8 +203,7 @@ void MachineBlockFrequencyInfo::calculate(
     view("MachineBlockFrequencyDAGS." + F.getName());
   }
   if (PrintMachineBlockFreq &&
-      (PrintBlockFreqFuncName.empty() ||
-       F.getName().equals(PrintBlockFreqFuncName))) {
+      (PrintBFIFuncName.empty() || F.getName().equals(PrintBFIFuncName))) {
     MBFI->print(dbgs());
   }
 }
@@ -228,7 +227,7 @@ void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const {
 
 BlockFrequency
 MachineBlockFrequencyInfo::getBlockFreq(const MachineBasicBlock *MBB) const {
-  return MBFI ? MBFI->getBlockFreq(MBB) : 0;
+  return MBFI ? MBFI->getBlockFreq(MBB) : BlockFrequency(0);
 }
 
 std::optional<uint64_t> MachineBlockFrequencyInfo::getBlockProfileCount(
@@ -241,7 +240,7 @@ std::optional<uint64_t> MachineBlockFrequencyInfo::getBlockProfileCount(
 }
 
 std::optional<uint64_t>
-MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
+MachineBlockFrequencyInfo::getProfileCountFromFreq(BlockFrequency Freq) const {
   if (!MBFI)
     return std::nullopt;
 
@@ -263,7 +262,7 @@ void MachineBlockFrequencyInfo::onEdgeSplit(
   auto NewSuccFreq = MBFI->getBlockFreq(&NewPredecessor) *
                      MBPI.getEdgeProbability(&NewPredecessor, &NewSuccessor);
 
-  MBFI->setBlockFreq(&NewSuccessor, NewSuccFreq.getFrequency());
+  MBFI->setBlockFreq(&NewSuccessor, NewSuccFreq);
 }
 
 const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
@@ -274,18 +273,18 @@ const MachineBranchProbabilityInfo *MachineBlockFrequencyInfo::getMBPI() const {
   return MBFI ? &MBFI->getBPI() : nullptr;
 }
 
-raw_ostream &
-MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
-                                          const BlockFrequency Freq) const {
-  return MBFI ? MBFI->printBlockFreq(OS, Freq) : OS;
+BlockFrequency MachineBlockFrequencyInfo::getEntryFreq() const {
+  return MBFI ? MBFI->getEntryFreq() : BlockFrequency(0);
 }
 
-raw_ostream &
-MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
-                                          const MachineBasicBlock *MBB) const {
-  return MBFI ? MBFI->printBlockFreq(OS, MBB) : OS;
+Printable llvm::printBlockFreq(const MachineBlockFrequencyInfo &MBFI,
+                               BlockFrequency Freq) {
+  return Printable([&MBFI, Freq](raw_ostream &OS) {
+    printBlockFreqImpl(OS, MBFI.getEntryFreq(), Freq);
+  });
 }
 
-uint64_t MachineBlockFrequencyInfo::getEntryFreq() const {
-  return MBFI ? MBFI->getEntryFreq() : 0;
+Printable llvm::printBlockFreq(const MachineBlockFrequencyInfo &MBFI,
+                               const MachineBasicBlock &MBB) {
+  return printBlockFreq(MBFI, MBFI.getBlockFreq(&MBB));
 }
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 603c0e9600afc3..d0d3574b30bfd8 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -444,9 +444,9 @@ class MachineBlockPlacement : public MachineFunctionPass {
     if (UseProfileCount) {
       auto Count = MBFI->getBlockProfileCount(BB);
       if (Count)
-        return *Count;
+        return BlockFrequency(*Count);
       else
-        return 0;
+        return BlockFrequency(0);
     } else
       return MBFI->getBlockFreq(BB);
   }
@@ -795,10 +795,10 @@ bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
 /// penalty is less than 100%
 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
 static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
-                            uint64_t EntryFreq) {
+                            BlockFrequency EntryFreq) {
   BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
   BlockFrequency Gain = A - B;
-  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
+  return (Gain / ThresholdProb) >= EntryFreq;
 }
 
 /// Check the edge frequencies to see if tail duplication will increase
@@ -843,7 +843,7 @@ bool MachineBlockPlacement::isProfitableToTailDup(
   auto SuccFreq = MBFI->getBlockFreq(Succ);
   BlockFrequency P = BBFreq * PProb;
   BlockFrequency Qout = BBFreq * QProb;
-  uint64_t EntryFreq = MBFI->getEntryFreq();
+  BlockFrequency EntryFreq = MBFI->getEntryFreq();
   // If there are no more successors, it is profitable to copy, as it strictly
   // increases fallthrough.
   if (SuccSuccs.size() == 0)
@@ -1729,8 +1729,9 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
            "Found CFG-violating block");
 
     BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
-    LLVM_DEBUG(dbgs() << "    " << getBlockName(MBB) << " -> ";
-               MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
+    LLVM_DEBUG(dbgs() << "    " << getBlockName(MBB) << " -> "
+                      << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
+                      << " (freq)\n");
 
     // For ehpad, we layout the least probable first as to avoid jumping back
     // from least probable landingpads to more probable ones.
@@ -1927,7 +1928,7 @@ BlockFrequency
 MachineBlockPlacement::TopFallThroughFreq(
     const MachineBasicBlock *Top,
     const BlockFilterSet &LoopBlockSet) {
-  BlockFrequency MaxFreq = 0;
+  BlockFrequency MaxFreq = BlockFrequency(0);
   for (MachineBasicBlock *Pred : Top->predecessors()) {
     BlockChain *PredChain = BlockToChain[Pred];
     if (!LoopBlockSet.count(Pred) &&
@@ -1986,7 +1987,7 @@ MachineBlockPlacement::FallThroughGains(
     const MachineBasicBlock *ExitBB,
     const BlockFilterSet &LoopBlockSet) {
   BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
-  BlockFrequency FallThrough2Exit = 0;
+  BlockFrequency FallThrough2Exit = BlockFrequency(0);
   if (ExitBB)
     FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
         MBPI->getEdgeProbability(NewTop, ExitBB);
@@ -1994,58 +1995,58 @@ MachineBlockPlacement::FallThroughGains(
       MBPI->getEdgeProbability(NewTop, OldTop);
 
   // Find the best Pred of NewTop.
-   MachineBasicBlock *BestPred = nullptr;
-   BlockFrequency FallThroughFromPred = 0;
-   for (MachineBasicBlock *Pred : NewTop->predecessors()) {
-     if (!LoopBlockSet.count(Pred))
-       continue;
-     BlockChain *PredChain = BlockToChain[Pred];
-     if (!PredChain || Pred == *std::prev(PredChain->end())) {
-       BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
-           MBPI->getEdgeProbability(Pred, NewTop);
-       if (EdgeFreq > FallThroughFromPred) {
-         FallThroughFromPred = EdgeFreq;
-         BestPred = Pred;
-       }
-     }
-   }
-
-   // If NewTop is not placed after Pred, another successor can be placed
-   // after Pred.
-   BlockFrequency NewFreq = 0;
-   if (BestPred) {
-     for (MachineBasicBlock *Succ : BestPred->successors()) {
-       if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
-         continue;
-       if (ComputedEdges.contains(Succ))
-         continue;
-       BlockChain *SuccChain = BlockToChain[Succ];
-       if ((SuccChain && (Succ != *SuccChain->begin())) ||
-           (SuccChain == BlockToChain[BestPred]))
-         continue;
-       BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
-           MBPI->getEdgeProbability(BestPred, Succ);
-       if (EdgeFreq > NewFreq)
-         NewFreq = EdgeFreq;
-     }
-     BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
-         MBPI->getEdgeProbability(BestPred, NewTop);
-     if (NewFreq > OrigEdgeFreq) {
-       // If NewTop is not the best successor of Pred, then Pred doesn't
-       // fallthrough to NewTop. So there is no FallThroughFromPred and
-       // NewFreq.
-       NewFreq = 0;
-       FallThroughFromPred = 0;
-     }
-   }
-
-   BlockFrequency Result = 0;
-   BlockFrequency Gains = BackEdgeFreq + NewFreq;
-   BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
-       FallThroughFromPred;
-   if (Gains > Lost)
-     Result = Gains - Lost;
-   return Result;
+  MachineBasicBlock *BestPred = nullptr;
+  BlockFrequency FallThroughFromPred = BlockFrequency(0);
+  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
+    if (!LoopBlockSet.count(Pred))
+      continue;
+    BlockChain *PredChain = BlockToChain[Pred];
+    if (!PredChain || Pred == *std::prev(PredChain->end())) {
+      BlockFrequency EdgeFreq =
+          MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
+      if (EdgeFreq > FallThroughFromPred) {
+        FallThroughFromPred = EdgeFreq;
+        BestPred = Pred;
+      }
+    }
+  }
+
+  // If NewTop is not placed after Pred, another successor can be placed
+  // after Pred.
+  BlockFrequency NewFreq = BlockFrequency(0);
+  if (BestPred) {
+    for (MachineBasicBlock *Succ : BestPred->successors()) {
+      if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
+        continue;
+      if (ComputedEdges.contains(Succ))
+        continue;
+      BlockChain *SuccChain = BlockToChain[Succ];
+      if ((SuccChain && (Succ != *SuccChain->begin())) ||
+          (SuccChain == BlockToChain[BestPred]))
+        continue;
+      BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
+                                MBPI->getEdgeProbability(BestPred, Succ);
+      if (EdgeFreq > NewFreq)
+        NewFreq = EdgeFreq;
+    }
+    BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
+                                  MBPI->getEdgeProbability(BestPred, NewTop);
+    if (NewFreq > OrigEdgeFreq) {
+      // If NewTop is not the best successor of Pred, then Pred doesn't
+      // fallthrough to NewTop. So there is no FallThroughFromPred and
+      // NewFreq.
+      NewFreq = BlockFrequency(0);
+      FallThroughFromPred = BlockFrequency(0);
+    }
+  }
+
+  BlockFrequency Result = BlockFrequency(0);
+  BlockFrequency Gains = BackEdgeFreq + NewFreq;
+  BlockFrequency Lost =
+      FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
+  if (Gains > Lost)
+    Result = Gains - Lost;
+  return Result;
 }
 
 /// Helper function of findBestLoopTop. Find the best loop top block
@@ -2087,7 +2088,7 @@ MachineBlockPlacement::findBestLoopTopHelper(
   LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
                     << "\n");
 
-  BlockFrequency BestGains = 0;
+  BlockFrequency BestGains = BlockFrequency(0);
   MachineBasicBlock *BestPred = nullptr;
   for (MachineBasicBlock *Pred : OldTop->predecessors()) {
     if (!LoopBlockSet.count(Pred))
@@ -2095,8 +2096,8 @@ MachineBlockPlacement::findBestLoopTopHelper(
     if (Pred == L.getHeader())
       continue;
     LLVM_DEBUG(dbgs() << "   old top pred: " << getBlockName(Pred) << ", has "
-                      << Pred->succ_size() << " successors, ";
-               MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
+                      << Pred->succ_size() << " successors, "
+                      << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
     if (Pred->succ_size() > 2)
       continue;
 
@@ -2112,8 +2113,9 @@ MachineBlockPlacement::findBestLoopTopHelper(
 
     BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
                                             LoopBlockSet);
-    if ((Gains > 0) && (Gains > BestGains ||
-        ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
+    if ((Gains > BlockFrequency(0)) &&
+        (Gains > BestGains ||
+         ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
       BestPred = Pred;
       BestGains = Gains;
     }
@@ -2239,10 +2241,10 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
       }
 
       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
-      LLVM_DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
-                        << getBlockName(Succ) << " [L:" << SuccLoopDepth
-                        << "] (";
-                 MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
+      LLVM_DEBUG(
+          dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
+                 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
+                 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
       // Note that we bias this toward an existing layout successor to retain
       // incoming order in the absence of better information. The exit must have
       // a frequency higher than the current exit before we consider breaking
@@ -2425,14 +2427,14 @@ void MachineBlockPlacement::rotateLoopWithProfile(
   if (ChainHeaderBB->isEntryBlock())
     return;
 
-  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
+  BlockFrequency SmallestRotationCost = BlockFrequency::max();
 
   // A utility lambda that scales up a block frequency by dividing it by a
   // branch probability which is the reciprocal of the scale.
   auto ScaleBlockFrequency = [](BlockFrequency Freq,
                                 unsigned Scale) -> BlockFrequency {
     if (Scale == 0)
-      return 0;
+      return BlockFrequency(0);
     // Use operator / between BlockFrequency and BranchProbability to implement
     // saturating multiplication.
     return Freq / BranchProbability(1, Scale);
@@ -2492,7 +2494,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
     auto TailBB = *TailIter;
 
     // Calculate the cost by putting this BB to the top.
-    BlockFrequency Cost = 0;
+    BlockFrequency Cost = BlockFrequency(0);
 
     // If the current BB is the loop header, we need to take into account the
     // cost of the missed fall through edge from outside of the loop to the
@@ -2523,8 +2525,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
     if (TailBB->isSuccessor(*Iter)) {
       auto TailBBFreq = MBFI->getBlockFreq(TailBB);
       if (TailBB->succ_size() == 1)
-        Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
-                                    MisfetchCost + JumpInstCost);
+        Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
       else if (TailBB->succ_size() == 2) {
         auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
         auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
@@ -2537,8 +2538,8 @@ void MachineBlockPlacement::rotateLoopWithProfile(
     }
 
     LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
-                      << getBlockName(*Iter)
-                      << " to the top: " << Cost.getFrequency() << "\n");
+                      << getBlockName(*Iter) << " to the top: "
+                      << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
 
     if (Cost < SmallestRotationCost) {
       SmallestRotationCost = Cost;
@@ -3159,7 +3160,7 @@ static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
 // So we should scale the threshold accordingly. But the instruction size is not
 // available on all targets, so we use the number of instructions instead.
 BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
-  return DupThreshold.getFrequency() * countMBBInstruction(BB);
+  return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
 }
 
 // Returns true if BB is Pred's best successor.
@@ -3313,7 +3314,7 @@ void MachineBlockPlacement::findDuplicateCandidates(
 }
 
 void MachineBlockPlacement::initDupThreshold() {
-  DupThreshold = 0;
+  DupThreshold = BlockFrequency(0);
   if (!F->getFunction().hasProfileData())
     return;
 
@@ -3321,12 +3322,13 @@ void MachineBlockPlacement::initDupThreshold() {
   uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
   if (HotThreshold != UINT64_MAX) {
     UseProfileCount = true;
-    DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
+    DupThreshold =
+        BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
     return;
   }
 
   // Profile count is not available, we can use block frequency instead.
-  BlockFrequency MaxFreq = 0;
+  BlockFrequency MaxFreq = BlockFrequency(0);
   for (MachineBasicBlock &MBB : *F) {
     BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
     if (Freq > MaxFreq)
@@ -3334,7 +3336,7 @@ void MachineBlockPlacement::initDupThreshold() {
   }
 
   BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
-  DupThreshold = MaxFreq * ThresholdProb;
+  DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
   UseProfileCount = false;
 }
 
diff --git a/llvm/lib/CodeGen/MachineFunctionPass.cpp b/llvm/lib/CodeGen/MachineFunctionPass.cpp
index 3a1e1720be9c62..d57a912f418b72 100644
--- a/llvm/lib/CodeGen/MachineFunctionPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -88,6 +88,8 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
     MF.print(OS);
   }
 
+  MFProps.reset(ClearedProperties);
+
   bool RV = runOnMachineFunction(MF);
 
   if (ShouldEmitSizeRemarks) {
@@ -114,7 +116,6 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
   }
 
   MFProps.set(SetProperties);
-  MFProps.reset(ClearedProperties);
 
   // For --print-changed, print if the serialized MF has changed. Modes other
   // than quiet/verbose are unimplemented and treated the same as 'quiet'.
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 9d4e0c647048f5..073c9a082d1263 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -528,7 +528,10 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
           continue;
         MachineInstr *NewDbgMI = SinkDst->getMF()->CloneMachineInstr(DbgMI);
         SinkMBB.insertAfter(InsertPt, NewDbgMI);
-        NewDbgMI->getOperand(0).setReg(DstReg);
+        for (auto &SrcMO : DbgMI->getDebugOperandsForReg(DefReg)) {
+          auto &DstMO = NewDbgMI->getOperand(SrcMO.getOperandNo());
+          DstMO.setReg(DstReg);
+        }
       }
     } else {
       // Fold instruction into the addressing mode of a memory instruction.
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 248cc1ac0ee2e1..4d476924a7dbf7 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -607,7 +607,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
 
   // Reset interference dependent info.
   SplitConstraints.resize(UseBlocks.size());
-  BlockFrequency StaticCost = 0;
+  BlockFrequency StaticCost = BlockFrequency(0);
   for (unsigned I = 0; I != UseBlocks.size(); ++I) {
     const SplitAnalysis::BlockInfo &BI = UseBlocks[I];
     SpillPlacement::BlockConstraint &BC = SplitConstraints[I];
@@ -832,7 +832,7 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
 /// calcSpillCost - Compute how expensive it would be to split the live range in
 /// SA around all use blocks instead of forming bundle regions.
 BlockFrequency RAGreedy::calcSpillCost() {
-  BlockFrequency Cost = 0;
+  BlockFrequency Cost = BlockFrequency(0);
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
     unsigned Number = BI.MBB->getNumber();
@@ -852,7 +852,7 @@ BlockFrequency RAGreedy::calcSpillCost() {
 ///
 BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
                                              const AllocationOrder &Order) {
-  BlockFrequency GlobalCost = 0;
+  BlockFrequency GlobalCost = BlockFrequency(0);
   const BitVector &LiveBundles = Cand.LiveBundles;
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned I = 0; I != UseBlocks.size(); ++I) {
@@ -1056,13 +1056,13 @@ MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg,
   if (HasCompact) {
     // Yes, keep GlobalCand[0] as the compact region candidate.
     NumCands = 1;
-    BestCost = BlockFrequency::getMaxFrequency();
+    BestCost = BlockFrequency::max();
   } else {
     // No benefit from the compact region, our fallback will be per-block
     // splitting. Make sure we find a solution that is cheaper than spilling.
     BestCost = SpillCost;
-    LLVM_DEBUG(dbgs() << "Cost of isolating all blocks = ";
-               MBFI->printBlockFreq(dbgs(), BestCost) << '\n');
+    LLVM_DEBUG(dbgs() << "Cost of isolating all blocks = "
+                      << printBlockFreq(*MBFI, BestCost) << '\n');
   }
 
   unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
@@ -1112,8 +1112,8 @@ RAGreedy::calculateRegionSplitCostAroundReg(MCPhysReg PhysReg,
     LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
     return BestCand;
   }
-  LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
-             MBFI->printBlockFreq(dbgs(), Cost));
+  LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI)
+                    << "\tstatic = " << printBlockFreq(*MBFI, Cost));
   if (Cost >= BestCost) {
     LLVM_DEBUG({
       if (BestCand == NoCand)
@@ -1139,8 +1139,7 @@ RAGreedy::calculateRegionSplitCostAroundReg(MCPhysReg PhysReg,
 
   Cost += calcGlobalSplitCost(Cand, Order);
   LLVM_DEBUG({
-    dbgs() << ", total = ";
-    MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
+    dbgs() << ", total = " << printBlockFreq(*MBFI, Cost) << " with bundles";
     for (int I : Cand.LiveBundles.set_bits())
       dbgs() << " EB#" << I;
     dbgs() << ".\n";
@@ -1222,7 +1221,7 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint,
   if (ExtraInfo->getStage(VirtReg) >= RS_Split2)
     return false;
 
-  BlockFrequency Cost = 0;
+  BlockFrequency Cost = BlockFrequency(0);
   Register Reg = VirtReg.reg();
 
   // Compute the cost of assigning a non Hint physical register to VirtReg.
@@ -1249,7 +1248,7 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint,
   // Decrease the cost so it will be split in colder blocks.
   BranchProbability Threshold(SplitThresholdForRegWithHint, 100);
   Cost *= Threshold;
-  if (Cost == 0)
+  if (Cost == BlockFrequency(0))
     return false;
 
   unsigned NumCands = 0;
@@ -1630,8 +1629,8 @@ unsigned RAGreedy::tryLocalSplit(const LiveInterval &VirtReg,
   float BestDiff = 0;
 
   const float blockFreq =
-    SpillPlacer->getBlockFrequency(BI.MBB->getNumber()).getFrequency() *
-    (1.0f / MBFI->getEntryFreq());
+      SpillPlacer->getBlockFrequency(BI.MBB->getNumber()).getFrequency() *
+      (1.0f / MBFI->getEntryFreq().getFrequency());
   SmallVector<float, 8> GapWeight;
 
   for (MCPhysReg PhysReg : Order) {
@@ -2199,9 +2198,9 @@ void RAGreedy::initializeCSRCost() {
     return;
 
   // Raw cost is relative to Entry == 2^14; scale it appropriately.
-  uint64_t ActualEntry = MBFI->getEntryFreq();
+  uint64_t ActualEntry = MBFI->getEntryFreq().getFrequency();
   if (!ActualEntry) {
-    CSRCost = 0;
+    CSRCost = BlockFrequency(0);
     return;
   }
   uint64_t FixedEntry = 1 << 14;
@@ -2212,7 +2211,8 @@ void RAGreedy::initializeCSRCost() {
     CSRCost /= BranchProbability(FixedEntry, ActualEntry);
   else
     // Can't use BranchProbability in general, since it takes 32-bit numbers.
-    CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
+    CSRCost =
+        BlockFrequency(CSRCost.getFrequency() * (ActualEntry / FixedEntry));
 }
 
 /// Collect the hint info for \p Reg.
@@ -2243,7 +2243,7 @@ void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
 /// \return The cost of \p List for \p PhysReg.
 BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
                                            MCRegister PhysReg) {
-  BlockFrequency Cost = 0;
+  BlockFrequency Cost = BlockFrequency(0);
   for (const HintInfo &Info : List) {
     if (Info.PhysReg != PhysReg)
       Cost += Info.Freq;
@@ -2312,9 +2312,9 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
       LLVM_DEBUG(dbgs() << "Checking profitability:\n");
       BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
       BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
-      LLVM_DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
-                        << "\nNew Cost: " << NewCopiesCost.getFrequency()
-                        << '\n');
+      LLVM_DEBUG(dbgs() << "Old Cost: " << printBlockFreq(*MBFI, OldCopiesCost)
+                        << "\nNew Cost: "
+                        << printBlockFreq(*MBFI, NewCopiesCost) << '\n');
       if (OldCopiesCost < NewCopiesCost) {
         LLVM_DEBUG(dbgs() << "=> Not profitable.\n");
         continue;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 238462113bfe78..7e5ce300370c92 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1317,6 +1317,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (SrcIdx && DstIdx)
     return false;
 
+  [[maybe_unused]] const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
   const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
   if (!DefMI->isImplicitDef()) {
     if (DstReg.isPhysical()) {
@@ -1396,7 +1397,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
     MachineOperand &MO = CopyMI->getOperand(I);
     if (MO.isReg()) {
       assert(MO.isImplicit() && "No explicit operands after implicit operands.");
-      assert(MO.getReg().isPhysical() && "unexpected implicit virtual register def");
+      assert((MO.getReg().isPhysical() ||
+              (MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) &&
+             "unexpected implicit virtual register def");
       ImplicitOps.push_back(MO);
     }
   }
@@ -1407,14 +1410,43 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   // NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86).
   // We need to remember these so we can add intervals once we insert
   // NewMI into SlotIndexes.
+  //
+  // We also expect to have tied implicit-defs of super registers originating
+  // from SUBREG_TO_REG, such as:
+  // $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
+  // undef %0.sub_32bit = MOV32r0 implicit-def dead $eflags, implicit-def %0
+
+  bool NewMIDefinesFullReg = false;
+
   SmallVector<MCRegister, 4> NewMIImplDefs;
   for (unsigned i = NewMI.getDesc().getNumOperands(),
                 e = NewMI.getNumOperands();
        i != e; ++i) {
     MachineOperand &MO = NewMI.getOperand(i);
     if (MO.isReg() && MO.isDef()) {
-      assert(MO.isImplicit() && MO.isDead() && MO.getReg().isPhysical());
-      NewMIImplDefs.push_back(MO.getReg().asMCReg());
+      assert(MO.isImplicit());
+      if (MO.getReg().isPhysical()) {
+        if (MO.getReg() == DstReg)
+          NewMIDefinesFullReg = true;
+
+        assert(MO.isImplicit() && MO.getReg().isPhysical() &&
+               (MO.isDead() ||
+                (DefSubIdx &&
+                 (TRI->getSubReg(MO.getReg(), DefSubIdx) ==
+                  MCRegister((unsigned)NewMI.getOperand(0).getReg())))));
+        NewMIImplDefs.push_back(MO.getReg().asMCReg());
+      } else {
+        assert(MO.getReg() == NewMI.getOperand(0).getReg() &&
+               MO.getSubReg() == 0);
+        // We're only expecting another def of the main output, so the range
+        // should get updated with the regular output range.
+        //
+        // FIXME: The range updating below probably needs updating to look at
+        // the super register if subranges are tracked.
+        assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
+               "subrange update for implicit-def of super register may not be "
+               "properly handled");
+      }
     }
   }
 
@@ -1526,8 +1558,12 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
     assert(DstReg.isPhysical() &&
            "Only expect virtual or physical registers in remat");
     NewMI.getOperand(0).setIsDead(true);
-    NewMI.addOperand(MachineOperand::CreateReg(
-        CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/));
+
+    if (!NewMIDefinesFullReg) {
+      NewMI.addOperand(MachineOperand::CreateReg(
+          CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/));
+    }
+
     // Record small dead def live-ranges for all the subregisters
     // of the destination register.
     // Otherwise, variables that live through may miss some
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 57cd1fcffb6101..36c91b7fa97e46 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -155,8 +155,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
   // Try to find the mapping for the scalar version of this intrinsic
   // and the exact vector width of the call operands in the
   // TargetLibraryInfo.
-  const std::string TLIName =
-      std::string(TLI.getVectorizedFunction(ScalarName, VF));
+  StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF);
 
   LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
                     << ScalarName << "` and vector width " << VF << ".\n");
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 65801aa0a17d5b..aaaee4dfdfac92 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -425,7 +425,7 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
     BasicBlock *StartBlock = SI->getParent();
     BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
     BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
-    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
     // Delete the unconditional branch that was just created by the split.
     StartBlock->getTerminator()->eraseFromParent();
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 542ed37f904ba8..f2bd5137bd2093 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9857,9 +9857,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
   }
 
-  if (SimplifyDemandedBits(SDValue(N, 0)))
-    return SDValue(N, 0);
-
   // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
   if (N0.getOpcode() == ISD::SHL) {
     auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
@@ -10077,6 +10074,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (SDValue NewSHL = visitShiftByConstant(N))
       return NewSHL;
 
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
   if (N0.getOpcode() == ISD::VSCALE && N1C) {
     const APInt &C0 = N0.getConstantOperandAPInt(0);
@@ -24151,7 +24151,7 @@ static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
 
   // Profitability check: only deal with extractions from the first subvector
   // unless the mask becomes an identity mask.
-  if (!ShuffleVectorInst::isIdentityMask(NewMask) ||
+  if (!ShuffleVectorInst::isIdentityMask(NewMask, NewMask.size()) ||
       any_of(NewMask, [](int M) { return M < 0; }))
     for (auto &DemandedSubvector : DemandedSubvectors)
       if (DemandedSubvector.second != 0)
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index f0affce7b6b805..a831295863399b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1327,6 +1327,14 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
                         << *DI << "\n");
       return true;
     }
+    if (auto SI = FuncInfo.StaticAllocaMap.find(dyn_cast<AllocaInst>(V));
+        SI != FuncInfo.StaticAllocaMap.end()) {
+      MachineOperand FrameIndexOp = MachineOperand::CreateFI(SI->second);
+      bool IsIndirect = false;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II, IsIndirect,
+              FrameIndexOp, Var, Expr);
+      return true;
+    }
     if (Register Reg = lookUpRegForValue(V)) {
       // FIXME: This does not handle register-indirect values at offset 0.
       if (!FuncInfo.MF->useDebugInstrRef()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index cd21af770e1a4d..764a873768101c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1620,7 +1620,11 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
   if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
                            TargetLowering::TypePromoteInteger) {
     EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
-    APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
+    APInt NewVal;
+    if (TLI->isSExtCheaperThanZExt(VT.getScalarType(), EltVT))
+      NewVal = Elt->getValue().sextOrTrunc(EltVT.getSizeInBits());
+    else
+      NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
     Elt = ConstantInt::get(*getContext(), NewVal);
   }
   // In other cases the element type is illegal and needs to be expanded, for
@@ -4091,8 +4095,11 @@ SelectionDAG::computeOverflowForSignedSub(SDValue N0, SDValue N1) const {
   if (ComputeNumSignBits(N0) > 1 && ComputeNumSignBits(N1) > 1)
     return OFK_Never;
 
-  // TODO: Add ConstantRange::signedSubMayOverflow handling.
-  return OFK_Sometime;
+  KnownBits N0Known = computeKnownBits(N0);
+  KnownBits N1Known = computeKnownBits(N1);
+  ConstantRange N0Range = ConstantRange::fromKnownBits(N0Known, true);
+  ConstantRange N1Range = ConstantRange::fromKnownBits(N1Known, true);
+  return mapOverflowResult(N0Range.signedSubMayOverflow(N1Range));
 }
 
 SelectionDAG::OverflowKind
@@ -4101,8 +4108,11 @@ SelectionDAG::computeOverflowForUnsignedSub(SDValue N0, SDValue N1) const {
   if (isNullConstant(N1))
     return OFK_Never;
 
-  // TODO: Add ConstantRange::unsignedSubMayOverflow handling.
-  return OFK_Sometime;
+  KnownBits N0Known = computeKnownBits(N0);
+  KnownBits N1Known = computeKnownBits(N1);
+  ConstantRange N0Range = ConstantRange::fromKnownBits(N0Known, false);
+  ConstantRange N1Range = ConstantRange::fromKnownBits(N1Known, false);
+  return mapOverflowResult(N0Range.unsignedSubMayOverflow(N1Range));
 }
 
 SelectionDAG::OverflowKind
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 4b1d3637a7462e..ab57d08e527e4d 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -139,7 +139,7 @@ class ShrinkWrap : public MachineFunctionPass {
   MachineOptimizationRemarkEmitter *ORE = nullptr;
 
   /// Frequency of the Entry block.
-  uint64_t EntryFreq = 0;
+  BlockFrequency EntryFreq;
 
   /// Current opcode for frame setup.
   unsigned FrameSetupOpcode = ~0u;
@@ -640,7 +640,7 @@ bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
       FindIDom<>(**DirtyPreds.begin(), DirtyPreds, *MDT, false);
 
   while (NewSave && (hasDirtyPred(ReachableByDirty, *NewSave) ||
-                     EntryFreq < MBFI->getBlockFreq(NewSave).getFrequency() ||
+                     EntryFreq < MBFI->getBlockFreq(NewSave) ||
                      /*Entry freq has been observed more than a loop block in
                         some cases*/
                      MLI->getLoopFor(NewSave)))
@@ -675,8 +675,8 @@ bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
          "Incorrect save or restore point due to dominance relations");
   assert((!MLI->getLoopFor(Save) && !MLI->getLoopFor(Restore)) &&
          "Unexpected save or restore point in a loop");
-  assert((EntryFreq >= MBFI->getBlockFreq(Save).getFrequency() &&
-          EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+  assert((EntryFreq >= MBFI->getBlockFreq(Save) &&
+          EntryFreq >= MBFI->getBlockFreq(Restore)) &&
          "Incorrect save or restore point based on block frequency");
   return true;
 }
@@ -878,21 +878,21 @@ bool ShrinkWrap::performShrinkWrapping(
     return false;
   }
 
-  LLVM_DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: " << EntryFreq
-                    << '\n');
+  LLVM_DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: "
+                    << EntryFreq.getFrequency() << '\n');
 
   const TargetFrameLowering *TFI =
       MachineFunc->getSubtarget().getFrameLowering();
   do {
     LLVM_DEBUG(dbgs() << "Shrink wrap candidates (#, Name, Freq):\nSave: "
                       << printMBBReference(*Save) << ' '
-                      << MBFI->getBlockFreq(Save).getFrequency()
+                      << printBlockFreq(*MBFI, *Save)
                       << "\nRestore: " << printMBBReference(*Restore) << ' '
-                      << MBFI->getBlockFreq(Restore).getFrequency() << '\n');
+                      << printBlockFreq(*MBFI, *Restore) << '\n');
 
     bool IsSaveCheap, TargetCanUseSaveAsPrologue = false;
-    if (((IsSaveCheap = EntryFreq >= MBFI->getBlockFreq(Save).getFrequency()) &&
-         EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+    if (((IsSaveCheap = EntryFreq >= MBFI->getBlockFreq(Save)) &&
+         EntryFreq >= MBFI->getBlockFreq(Restore)) &&
         ((TargetCanUseSaveAsPrologue = TFI->canUseAsPrologue(*Save)) &&
          TFI->canUseAsEpilogue(*Restore)))
       break;
diff --git a/llvm/lib/CodeGen/SpillPlacement.cpp b/llvm/lib/CodeGen/SpillPlacement.cpp
index 91da5e49713c1a..6e74e51d678285 100644
--- a/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -109,8 +109,10 @@ struct SpillPlacement::Node {
 
   /// clear - Reset per-query data, but preserve frequencies that only depend on
   /// the CFG.
-  void clear(const BlockFrequency &Threshold) {
-    BiasN = BiasP = Value = 0;
+  void clear(BlockFrequency Threshold) {
+    BiasN = BlockFrequency(0);
+    BiasP = BlockFrequency(0);
+    Value = 0;
     SumLinkWeights = Threshold;
     Links.clear();
   }
@@ -142,14 +144,14 @@ struct SpillPlacement::Node {
       BiasN += freq;
       break;
     case MustSpill:
-      BiasN = BlockFrequency::getMaxFrequency();
+      BiasN = BlockFrequency::max();
       break;
     }
   }
 
   /// update - Recompute Value from Bias and Links. Return true when node
   /// preference changes.
-  bool update(const Node nodes[], const BlockFrequency &Threshold) {
+  bool update(const Node nodes[], BlockFrequency Threshold) {
     // Compute the weighted sum of inputs.
     BlockFrequency SumN = BiasN;
     BlockFrequency SumP = BiasP;
@@ -237,8 +239,10 @@ void SpillPlacement::activate(unsigned n) {
   // limiting the number of blocks visited and the number of links in the
   // Hopfield network.
   if (bundles->getBlocks(n).size() > 100) {
-    nodes[n].BiasP = 0;
-    nodes[n].BiasN = (MBFI->getEntryFreq() / 16);
+    nodes[n].BiasP = BlockFrequency(0);
+    BlockFrequency BiasN = MBFI->getEntryFreq();
+    BiasN >>= 4;
+    nodes[n].BiasN = BiasN;
   }
 }
 
@@ -247,12 +251,12 @@ void SpillPlacement::activate(unsigned n) {
 /// Set the threshold relative to \c Entry.  Since the threshold is used as a
 /// bound on the open interval (-Threshold;Threshold), 1 is the minimum
 /// threshold.
-void SpillPlacement::setThreshold(const BlockFrequency &Entry) {
+void SpillPlacement::setThreshold(BlockFrequency Entry) {
   // Apparently 2 is a good threshold when Entry==2^14, but we need to scale
   // it.  Divide by 2^13, rounding as appropriate.
   uint64_t Freq = Entry.getFrequency();
   uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12));
-  Threshold = std::max(UINT64_C(1), Scaled);
+  Threshold = BlockFrequency(std::max(UINT64_C(1), Scaled));
 }
 
 /// addConstraints - Compute node biases and weights from a set of constraints.
diff --git a/llvm/lib/CodeGen/SpillPlacement.h b/llvm/lib/CodeGen/SpillPlacement.h
index bd37d85c6c0d6d..2a298c7a74aa61 100644
--- a/llvm/lib/CodeGen/SpillPlacement.h
+++ b/llvm/lib/CodeGen/SpillPlacement.h
@@ -162,7 +162,7 @@ class SpillPlacement : public MachineFunctionPass {
   void releaseMemory() override;
 
   void activate(unsigned n);
-  void setThreshold(const BlockFrequency &Entry);
+  void setThreshold(BlockFrequency Entry);
 
   bool update(unsigned n);
 };
diff --git a/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp
index f66dfbf217b389..96017bac31342a 100644
--- a/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp
@@ -1090,8 +1090,7 @@ void CompileUnit::cloneDieAttrExpression(
         OutputExpression.push_back(dwarf::DW_OP_addr);
         uint64_t LinkedAddress =
             SA->Address + (VarAddressAdjustment ? *VarAddressAdjustment : 0);
-        if ((getEndianness() == support::endianness::little) !=
-            sys::IsLittleEndianHost)
+        if (getEndianness() != llvm::endianness::native)
           sys::swapByteOrder(LinkedAddress);
         ArrayRef<uint8_t> AddressBytes(
             reinterpret_cast<const uint8_t *>(&LinkedAddress),
@@ -1128,8 +1127,7 @@ void CompileUnit::cloneDieAttrExpression(
           OutputExpression.push_back(*OutOperandKind);
           uint64_t LinkedAddress =
               SA->Address + (VarAddressAdjustment ? *VarAddressAdjustment : 0);
-          if ((getEndianness() == support::endianness::little) !=
-              sys::IsLittleEndianHost)
+          if (getEndianness() != llvm::endianness::native)
             sys::swapByteOrder(LinkedAddress);
           ArrayRef<uint8_t> AddressBytes(
               reinterpret_cast<const uint8_t *>(&LinkedAddress),
diff --git a/llvm/lib/DWARFLinkerParallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinkerParallel/DWARFLinkerImpl.cpp
index 51d38e1b555cd8..a02326f025d5bb 100644
--- a/llvm/lib/DWARFLinkerParallel/DWARFLinkerImpl.cpp
+++ b/llvm/lib/DWARFLinkerParallel/DWARFLinkerImpl.cpp
@@ -64,7 +64,7 @@ Error DWARFLinkerImpl::link() {
 
   dwarf::FormParams GlobalFormat = {GlobalData.getOptions().TargetDWARFVersion,
                                     0, dwarf::DwarfFormat::DWARF32};
-  support::endianness GlobalEndianness = support::endian::system_endianness();
+  support::endianness GlobalEndianness = llvm::endianness::native;
 
   if (TheDwarfEmitter) {
     GlobalEndianness = TheDwarfEmitter->getTargetTriple().isLittleEndian()
diff --git a/llvm/lib/DWARFLinkerParallel/OutputSections.cpp b/llvm/lib/DWARFLinkerParallel/OutputSections.cpp
index 9c711f4b6bdcde..8e7caa7bb40742 100644
--- a/llvm/lib/DWARFLinkerParallel/OutputSections.cpp
+++ b/llvm/lib/DWARFLinkerParallel/OutputSections.cpp
@@ -151,18 +151,18 @@ void SectionDescriptor::emitIntVal(uint64_t Val, unsigned Size) {
   } break;
   case 2: {
     uint16_t ShortVal = static_cast<uint16_t>(Val);
-    if ((Endianess == support::endianness::little) != sys::IsLittleEndianHost)
+    if (Endianess != llvm::endianness::native)
       sys::swapByteOrder(ShortVal);
     OS.write(reinterpret_cast<const char *>(&ShortVal), Size);
   } break;
   case 4: {
     uint32_t ShortVal = static_cast<uint32_t>(Val);
-    if ((Endianess == support::endianness::little) != sys::IsLittleEndianHost)
+    if (Endianess != llvm::endianness::native)
       sys::swapByteOrder(ShortVal);
     OS.write(reinterpret_cast<const char *>(&ShortVal), Size);
   } break;
   case 8: {
-    if ((Endianess == support::endianness::little) != sys::IsLittleEndianHost)
+    if (Endianess != llvm::endianness::native)
       sys::swapByteOrder(Val);
     OS.write(reinterpret_cast<const char *>(&Val), Size);
   } break;
diff --git a/llvm/lib/DWARFLinkerParallel/OutputSections.h b/llvm/lib/DWARFLinkerParallel/OutputSections.h
index 82015e16d32cd1..f34ac25ee3fd43 100644
--- a/llvm/lib/DWARFLinkerParallel/OutputSections.h
+++ b/llvm/lib/DWARFLinkerParallel/OutputSections.h
@@ -432,7 +432,7 @@ class OutputSections {
   dwarf::FormParams Format = {4, 4, dwarf::DWARF32};
 
   /// Endiannes for sections.
-  support::endianness Endianness = support::endian::system_endianness();
+  support::endianness Endianness = llvm::endianness::native;
 
   /// All keeping sections.
   using SectionsSetTy = std::map<DebugSectionKind, SectionDescriptor>;
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 1e1ab814673f42..3d943a04da2c22 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -193,7 +193,7 @@ static T &getAccelTable(std::unique_ptr<T> &Cache, const DWARFObject &Obj,
     return *Cache;
   DWARFDataExtractor AccelSection(Obj, Section, IsLittleEndian, 0);
   DataExtractor StrData(StringSection, IsLittleEndian, 0);
-  Cache.reset(new T(AccelSection, StrData));
+  Cache = std::make_unique<T>(AccelSection, StrData);
   if (Error E = Cache->extract())
     llvm::consumeError(std::move(E));
   return *Cache;
@@ -377,7 +377,7 @@ class ThreadUnsafeDWARFContextState : public DWARFContext::DWARFContextState {
             ? DWARFDataExtractor(DObj, DObj.getLocSection(), D.isLittleEndian(),
                                  D.getUnitAtIndex(0)->getAddressByteSize())
             : DWARFDataExtractor("", D.isLittleEndian(), 0);
-    Loc.reset(new DWARFDebugLoc(std::move(Data)));
+    Loc = std::make_unique<DWARFDebugLoc>(std::move(Data));
     return Loc.get();
   }
 
@@ -385,7 +385,7 @@ class ThreadUnsafeDWARFContextState : public DWARFContext::DWARFContextState {
     if (Aranges)
       return Aranges.get();
 
-    Aranges.reset(new DWARFDebugAranges());
+    Aranges = std::make_unique<DWARFDebugAranges>();
     Aranges->generate(&D);
     return Aranges.get();
   }
@@ -393,7 +393,7 @@ class ThreadUnsafeDWARFContextState : public DWARFContext::DWARFContextState {
   Expected<const DWARFDebugLine::LineTable *>
   getLineTableForUnit(DWARFUnit *U, function_ref<void(Error)> RecoverableErrorHandler) override {
     if (!Line)
-      Line.reset(new DWARFDebugLine);
+      Line = std::make_unique<DWARFDebugLine>();
 
     auto UnitDIE = U->getUnitDIE();
     if (!UnitDIE)
@@ -735,9 +735,9 @@ DWARFContext::DWARFContext(std::unique_ptr<const DWARFObject> DObj,
       RecoverableErrorHandler(RecoverableErrorHandler),
       WarningHandler(WarningHandler), DObj(std::move(DObj)) {
         if (ThreadSafe)
-          State.reset(new ThreadSafeState(*this, DWPName));
+          State = std::make_unique<ThreadSafeState>(*this, DWPName);
         else
-          State.reset(new ThreadUnsafeDWARFContextState(*this, DWPName));
+          State = std::make_unique<ThreadUnsafeDWARFContextState>(*this, DWPName);
       }
 
 DWARFContext::~DWARFContext() = default;
diff --git a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
index 145a43d3b381bb..07303d551af50b 100644
--- a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
@@ -101,7 +101,7 @@ uint64_t FunctionInfo::cacheEncoding() {
   if (!isValid())
     return 0;
   raw_svector_ostream OutStrm(EncodingCache);
-  FileWriter FW(OutStrm, support::endian::system_endianness());
+  FileWriter FW(OutStrm, llvm::endianness::native);
   llvm::Expected<uint64_t> Result = encode(FW);
   if (!Result) {
     EncodingCache.clear();
@@ -123,7 +123,7 @@ llvm::Expected<uint64_t> FunctionInfo::encode(FileWriter &Out) const {
   // precompute exactly how big FunctionInfo objects encode into so we can
   // accurately make segments of a specific size.
   if (!EncodingCache.empty() &&
-      support::endian::system_endianness() == Out.getByteOrder()) {
+      llvm::endianness::native == Out.getByteOrder()) {
     // We already encoded this object, just write out the bytes.
     Out.writeData(llvm::ArrayRef<uint8_t>((const uint8_t *)EncodingCache.data(),
                                           EncodingCache.size()));
diff --git a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
index 6afaeea8f598e1..7e6eec71d1ad2c 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
@@ -23,11 +23,10 @@
 using namespace llvm;
 using namespace gsym;
 
-GsymReader::GsymReader(std::unique_ptr<MemoryBuffer> Buffer) :
-    MemBuffer(std::move(Buffer)),
-    Endian(support::endian::system_endianness()) {}
+GsymReader::GsymReader(std::unique_ptr<MemoryBuffer> Buffer)
+    : MemBuffer(std::move(Buffer)), Endian(llvm::endianness::native) {}
 
-  GsymReader::GsymReader(GsymReader &&RHS) = default;
+GsymReader::GsymReader(GsymReader &&RHS) = default;
 
 GsymReader::~GsymReader() = default;
 
@@ -60,8 +59,7 @@ GsymReader::create(std::unique_ptr<MemoryBuffer> &MemBuffer) {
 
 llvm::Error
 GsymReader::parse() {
-  BinaryStreamReader FileData(MemBuffer->getBuffer(),
-                              support::endian::system_endianness());
+  BinaryStreamReader FileData(MemBuffer->getBuffer(), llvm::endianness::native);
   // Check for the magic bytes. This file format is designed to be mmap'ed
   // into a process and accessed as read only. This is done for performance
   // and efficiency for symbolicating and parsing GSYM data.
@@ -69,7 +67,7 @@ GsymReader::parse() {
     return createStringError(std::errc::invalid_argument,
                              "not enough data for a GSYM header");
 
-  const auto HostByteOrder = support::endian::system_endianness();
+  const auto HostByteOrder = llvm::endianness::native;
   switch (Hdr->Magic) {
     case GSYM_MAGIC:
       Endian = HostByteOrder;
@@ -261,7 +259,10 @@ llvm::Expected<FunctionInfo> GsymReader::getFunctionInfo(uint64_t Addr) const {
   // Address info offsets size should have been checked in parse().
   assert(*AddressIndex < AddrInfoOffsets.size());
   auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex];
-  DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset), Endian, 4);
+  assert((Endian == support::big || Endian == support::little) &&
+         "Endian must be either big or little");
+  DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset),
+                     Endian == support::little, 4);
   if (std::optional<uint64_t> OptAddr = getAddress(*AddressIndex)) {
     auto ExpectedFI = FunctionInfo::decode(Data, *OptAddr);
     if (ExpectedFI) {
@@ -283,7 +284,10 @@ llvm::Expected<LookupResult> GsymReader::lookup(uint64_t Addr) const {
   // Address info offsets size should have been checked in parse().
   assert(*AddressIndex < AddrInfoOffsets.size());
   auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex];
-  DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset), Endian, 4);
+  assert((Endian == support::big || Endian == support::little) &&
+         "Endian must be either big or little");
+  DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset),
+                     Endian == support::little, 4);
   if (std::optional<uint64_t> OptAddr = getAddress(*AddressIndex))
     return FunctionInfo::lookup(Data, *this, *OptAddr, Addr);
   return createStringError(std::errc::invalid_argument,
diff --git a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
index c26caa647ed9f6..e909e851a9051d 100644
--- a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
+++ b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -248,6 +249,8 @@ uint32_t MSFBuilder::computeDirectoryByteSize() const {
 }
 
 Expected<MSFLayout> MSFBuilder::generateLayout() {
+  llvm::TimeTraceScope timeScope("MSF: Generate layout");
+
   SuperBlock *SB = Allocator.Allocate<SuperBlock>();
   MSFLayout L;
   L.SB = SB;
@@ -336,6 +339,8 @@ static void commitFpm(WritableBinaryStream &MsfBuffer, const MSFLayout &Layout,
 
 Expected<FileBufferByteStream> MSFBuilder::commit(StringRef Path,
                                                   MSFLayout &Layout) {
+  llvm::TimeTraceScope timeScope("Commit MSF");
+
   Expected<MSFLayout> L = generateLayout();
   if (!L)
     return L.takeError();
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index 37c1b0407268cb..fb8b11b9c1588d 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Object/COFF.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Parallel.h"
+#include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -381,6 +382,7 @@ void DbiStreamBuilder::createSectionMap(
 
 Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
                                WritableBinaryStreamRef MsfBuffer) {
+  llvm::TimeTraceScope timeScope("Commit DBI stream");
   if (auto EC = finalize())
     return EC;
 
diff --git a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
index b17fbd63e9fd4d..9f9bd47aa63387 100644
--- a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Parallel.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/xxhash.h"
 #include <algorithm>
 #include <vector>
@@ -478,6 +479,7 @@ Error GSIStreamBuilder::commitGlobalsHashStream(
 
 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
                                WritableBinaryStreamRef Buffer) {
+  llvm::TimeTraceScope timeScope("Commit GSI stream");
   auto GS = WritableMappedBlockStream::createIndexedStream(
       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
   auto PS = WritableMappedBlockStream::createIndexedStream(
diff --git a/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
index e8f5a451b08ea7..95107125701de2 100644
--- a/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -14,6 +14,7 @@
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -55,6 +56,7 @@ Error InfoStreamBuilder::finalizeMsfLayout() {
 
 Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
                                 WritableBinaryStreamRef Buffer) const {
+  llvm::TimeTraceScope timeScope("Commit info stream");
   auto InfoS = WritableMappedBlockStream::createIndexedStream(
       Layout, Buffer, StreamPDB, Msf.getAllocator());
   BinaryStreamWriter Writer(*InfoS);
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index cd30b56be7cd85..06e379c3f6d237 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/CRC.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/xxhash.h"
 
 #include <ctime>
@@ -129,6 +130,7 @@ void PDBFileBuilder::addInjectedSource(StringRef Name,
 }
 
 Error PDBFileBuilder::finalizeMsfLayout() {
+  llvm::TimeTraceScope timeScope("MSF layout");
 
   if (Ipi && Ipi->getRecordCount() > 0) {
     // In theory newer PDBs always have an ID stream, but by saying that we're
@@ -254,6 +256,7 @@ void PDBFileBuilder::commitInjectedSources(WritableBinaryStream &MsfBuffer,
   if (InjectedSourceTable.empty())
     return;
 
+  llvm::TimeTraceScope timeScope("Commit injected sources");
   commitSrcHeaderBlock(MsfBuffer, Layout);
 
   for (const auto &IS : InjectedSources) {
@@ -290,15 +293,18 @@ Error PDBFileBuilder::commit(StringRef Filename, codeview::GUID *Guid) {
   if (auto EC = Strings.commit(NSWriter))
     return EC;
 
-  for (const auto &NSE : NamedStreamData) {
-    if (NSE.second.empty())
-      continue;
-
-    auto NS = WritableMappedBlockStream::createIndexedStream(
-        Layout, Buffer, NSE.first, Allocator);
-    BinaryStreamWriter NSW(*NS);
-    if (auto EC = NSW.writeBytes(arrayRefFromStringRef(NSE.second)))
-      return EC;
+  {
+    llvm::TimeTraceScope timeScope("Named stream data");
+    for (const auto &NSE : NamedStreamData) {
+      if (NSE.second.empty())
+        continue;
+
+      auto NS = WritableMappedBlockStream::createIndexedStream(
+          Layout, Buffer, NSE.first, Allocator);
+      BinaryStreamWriter NSW(*NS);
+      if (auto EC = NSW.writeBytes(arrayRefFromStringRef(NSE.second)))
+        return EC;
+    }
   }
 
   if (Info) {
@@ -338,6 +344,8 @@ Error PDBFileBuilder::commit(StringRef Filename, codeview::GUID *Guid) {
   // Set the build id at the very end, after every other byte of the PDB
   // has been written.
   if (Info->hashPDBContentsToGUID()) {
+    llvm::TimeTraceScope timeScope("Compute build ID");
+
     // Compute a hash of all sections of the output file.
     uint64_t Digest =
         xxh3_64bits({Buffer.getBufferStart(), Buffer.getBufferEnd()});
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
index c0245dc17cf1ef..91b3dd5c32b9f6 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
@@ -13,6 +13,7 @@
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/TimeProfiler.h"
 
 #include <map>
 
@@ -207,6 +208,7 @@ Error PDBStringTableBuilder::writeEpilogue(BinaryStreamWriter &Writer) const {
 }
 
 Error PDBStringTableBuilder::commit(BinaryStreamWriter &Writer) const {
+  llvm::TimeTraceScope timeScope("Commit strings table");
   BinaryStreamWriter SectionWriter;
 
   std::tie(SectionWriter, Writer) = Writer.split(sizeof(PDBStringTableHeader));
diff --git a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index aad5847651a0db..30d5a563e8bdae 100644
--- a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <algorithm>
 #include <cstdint>
 #include <numeric>
@@ -171,6 +172,7 @@ Error TpiStreamBuilder::finalizeMsfLayout() {
 
 Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
                                WritableBinaryStreamRef Buffer) {
+  llvm::TimeTraceScope timeScope("Commit TPI stream");
   if (auto EC = finalize())
     return EC;
 
diff --git a/llvm/lib/Debuginfod/HTTPClient.cpp b/llvm/lib/Debuginfod/HTTPClient.cpp
index f9201e4f96268c..4cca250746a597 100644
--- a/llvm/lib/Debuginfod/HTTPClient.cpp
+++ b/llvm/lib/Debuginfod/HTTPClient.cpp
@@ -97,6 +97,8 @@ HTTPClient::HTTPClient() {
   assert(Curl && "Curl could not be initialized");
   // Set the callback hooks.
   curl_easy_setopt(Curl, CURLOPT_WRITEFUNCTION, curlWriteFunction);
+  // Detect supported compressed encodings and accept all.
+  curl_easy_setopt(Curl, CURLOPT_ACCEPT_ENCODING, "");
 }
 
 HTTPClient::~HTTPClient() { curl_easy_cleanup(Curl); }
diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
index 86249591a9be05..c095b13e8c30c5 100644
--- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
@@ -138,71 +138,52 @@ Error EHFrameEdgeFixer::processBlock(ParseContext &PC, Block &B) {
       BlockEdges[E.getOffset()] = EdgeTarget(E);
     }
 
-  CIEInfosMap CIEInfos;
   BinaryStreamReader BlockReader(
       StringRef(B.getContent().data(), B.getContent().size()),
       PC.G.getEndianness());
-  while (!BlockReader.empty()) {
-    size_t RecordStartOffset = BlockReader.getOffset();
 
-    LLVM_DEBUG({
-      dbgs() << "    Processing CFI record at "
-             << (B.getAddress() + RecordStartOffset) << "\n";
-    });
-
-    // Get the record length.
-    Expected<size_t> RecordRemaining = readCFIRecordLength(B, BlockReader);
-    if (!RecordRemaining)
-      return RecordRemaining.takeError();
-
-    if (BlockReader.bytesRemaining() < *RecordRemaining)
-      return make_error<JITLinkError>(
-          "Incomplete CFI record at " +
-          formatv("{0:x16}", B.getAddress() + RecordStartOffset));
+  // Get the record length.
+  Expected<size_t> RecordRemaining = readCFIRecordLength(B, BlockReader);
+  if (!RecordRemaining)
+    return RecordRemaining.takeError();
+
+  // We expect DWARFRecordSectionSplitter to split each CFI record into its own
+  // block.
+  if (BlockReader.bytesRemaining() != *RecordRemaining)
+    return make_error<JITLinkError>("Incomplete CFI record at " +
+                                    formatv("{0:x16}", B.getAddress()));
+
+  // Read the CIE delta for this record.
+  uint64_t CIEDeltaFieldOffset = BlockReader.getOffset();
+  uint32_t CIEDelta;
+  if (auto Err = BlockReader.readInteger(CIEDelta))
+    return Err;
 
-    // Read the CIE delta for this record.
-    uint64_t CIEDeltaFieldOffset = BlockReader.getOffset() - RecordStartOffset;
-    uint32_t CIEDelta;
-    if (auto Err = BlockReader.readInteger(CIEDelta))
+  if (CIEDelta == 0) {
+    if (auto Err = processCIE(PC, B, CIEDeltaFieldOffset, BlockEdges))
+      return Err;
+  } else {
+    if (auto Err = processFDE(PC, B, CIEDeltaFieldOffset, CIEDelta, BlockEdges))
       return Err;
-
-    if (CIEDelta == 0) {
-      if (auto Err = processCIE(PC, B, RecordStartOffset,
-                                CIEDeltaFieldOffset + *RecordRemaining,
-                                CIEDeltaFieldOffset, BlockEdges))
-        return Err;
-    } else {
-      if (auto Err = processFDE(PC, B, RecordStartOffset,
-                                CIEDeltaFieldOffset + *RecordRemaining,
-                                CIEDeltaFieldOffset, CIEDelta, BlockEdges))
-        return Err;
-    }
-
-    // Move to the next record.
-    BlockReader.setOffset(RecordStartOffset + CIEDeltaFieldOffset +
-                          *RecordRemaining);
   }
 
   return Error::success();
 }
 
 Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B,
-                                   size_t RecordOffset, size_t RecordLength,
                                    size_t CIEDeltaFieldOffset,
                                    const BlockEdgeMap &BlockEdges) {
 
-  LLVM_DEBUG(dbgs() << "      Record is CIE\n");
+  LLVM_DEBUG(dbgs() << "    Record is CIE\n");
 
-  auto RecordContent = B.getContent().slice(RecordOffset, RecordLength);
   BinaryStreamReader RecordReader(
-      StringRef(RecordContent.data(), RecordContent.size()),
+      StringRef(B.getContent().data(), B.getContent().size()),
       PC.G.getEndianness());
 
   // Skip past the CIE delta field: we've already processed this far.
   RecordReader.setOffset(CIEDeltaFieldOffset + 4);
 
-  auto &CIESymbol =
-      PC.G.addAnonymousSymbol(B, RecordOffset, RecordLength, false, false);
+  auto &CIESymbol = PC.G.addAnonymousSymbol(B, 0, B.getSize(), false, false);
   CIEInformation CIEInfo(CIESymbol);
 
   uint8_t Version = 0;
@@ -268,7 +249,7 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B,
         if (auto Err =
                 getOrCreateEncodedPointerEdge(
                     PC, BlockEdges, *PersonalityPointerEncoding, RecordReader,
-                    B, RecordOffset + RecordReader.getOffset(), "personality")
+                    B, RecordReader.getOffset(), "personality")
                     .takeError())
           return Err;
         break;
@@ -279,7 +260,7 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B,
           if (CIEInfo.AddressEncoding == dwarf::DW_EH_PE_omit)
             return make_error<JITLinkError>(
                 "Invalid address encoding DW_EH_PE_omit in CIE at " +
-                formatv("{0:x}", (B.getAddress() + RecordOffset).getValue()));
+                formatv("{0:x}", B.getAddress().getValue()));
         } else
           return PE.takeError();
         break;
@@ -302,69 +283,50 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B,
 }
 
 Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
-                                   size_t RecordOffset, size_t RecordLength,
                                    size_t CIEDeltaFieldOffset,
                                    uint32_t CIEDelta,
                                    const BlockEdgeMap &BlockEdges) {
-  LLVM_DEBUG(dbgs() << "      Record is FDE\n");
+  LLVM_DEBUG(dbgs() << "    Record is FDE\n");
 
-  orc::ExecutorAddr RecordAddress = B.getAddress() + RecordOffset;
+  orc::ExecutorAddr RecordAddress = B.getAddress();
 
-  auto RecordContent = B.getContent().slice(RecordOffset, RecordLength);
   BinaryStreamReader RecordReader(
-      StringRef(RecordContent.data(), RecordContent.size()),
+      StringRef(B.getContent().data(), B.getContent().size()),
       PC.G.getEndianness());
 
   // Skip past the CIE delta field: we've already read this far.
   RecordReader.setOffset(CIEDeltaFieldOffset + 4);
 
-  auto &FDESymbol =
-      PC.G.addAnonymousSymbol(B, RecordOffset, RecordLength, false, false);
+  auto &FDESymbol = PC.G.addAnonymousSymbol(B, 0, B.getSize(), false, false);
 
   CIEInformation *CIEInfo = nullptr;
 
   {
     // Process the CIE pointer field.
-    auto CIEEdgeItr = BlockEdges.find(RecordOffset + CIEDeltaFieldOffset);
+    auto CIEEdgeItr = BlockEdges.find(CIEDeltaFieldOffset);
+    if (CIEEdgeItr != BlockEdges.end())
+      return make_error<JITLinkError>(
+          "CIE pointer field already has edge at " +
+          formatv("{0:x16}", RecordAddress + CIEDeltaFieldOffset));
+
     orc::ExecutorAddr CIEAddress =
         RecordAddress + orc::ExecutorAddrDiff(CIEDeltaFieldOffset) -
         orc::ExecutorAddrDiff(CIEDelta);
-    if (CIEEdgeItr == BlockEdges.end()) {
-
-      LLVM_DEBUG({
-        dbgs() << "        Adding edge at "
-               << (RecordAddress + CIEDeltaFieldOffset)
-               << " to CIE at: " << CIEAddress << "\n";
-      });
-      if (auto CIEInfoOrErr = PC.findCIEInfo(CIEAddress))
-        CIEInfo = *CIEInfoOrErr;
-      else
-        return CIEInfoOrErr.takeError();
-      assert(CIEInfo->CIESymbol && "CIEInfo has no CIE symbol set");
-      B.addEdge(NegDelta32, RecordOffset + CIEDeltaFieldOffset,
-                *CIEInfo->CIESymbol, 0);
-    } else {
-      LLVM_DEBUG({
-        dbgs() << "        Already has edge at "
-               << (RecordAddress + CIEDeltaFieldOffset) << " to CIE at "
-               << CIEAddress << "\n";
-      });
-      auto &EI = CIEEdgeItr->second;
-      if (EI.Addend)
-        return make_error<JITLinkError>(
-            "CIE edge at " +
-            formatv("{0:x16}", RecordAddress + CIEDeltaFieldOffset) +
-            " has non-zero addend");
-      if (auto CIEInfoOrErr = PC.findCIEInfo(EI.Target->getAddress()))
-        CIEInfo = *CIEInfoOrErr;
-      else
-        return CIEInfoOrErr.takeError();
-    }
+    LLVM_DEBUG({
+      dbgs() << "      Adding edge at " << (RecordAddress + CIEDeltaFieldOffset)
+             << " to CIE at: " << CIEAddress << "\n";
+    });
+    if (auto CIEInfoOrErr = PC.findCIEInfo(CIEAddress))
+      CIEInfo = *CIEInfoOrErr;
+    else
+      return CIEInfoOrErr.takeError();
+    assert(CIEInfo->CIESymbol && "CIEInfo has no CIE symbol set");
+    B.addEdge(NegDelta32, CIEDeltaFieldOffset, *CIEInfo->CIESymbol, 0);
   }
 
   // Process the PC-Begin field.
   LLVM_DEBUG({
-    dbgs() << "        Processing PC-begin at "
+    dbgs() << "      Processing PC-begin at "
            << (RecordAddress + RecordReader.getOffset()) << "\n";
   });
   if (auto PCBegin = getOrCreateEncodedPointerEdge(
@@ -375,14 +337,14 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
       // Add a keep-alive edge from the FDE target to the FDE to ensure that the
       // FDE is kept alive if its target is.
       LLVM_DEBUG({
-        dbgs() << "        Adding keep-alive edge from target at "
+        dbgs() << "      Adding keep-alive edge from target at "
                << (*PCBegin)->getBlock().getAddress() << " to FDE at "
                << RecordAddress << "\n";
       });
       (*PCBegin)->getBlock().addEdge(Edge::KeepAlive, 0, FDESymbol, 0);
     } else {
       LLVM_DEBUG({
-        dbgs() << "        WARNING: Not adding keep-alive edge to FDE at "
+        dbgs() << "      WARNING: Not adding keep-alive edge to FDE at "
                << RecordAddress << ", which points to "
                << ((*PCBegin)->isExternal() ? "external" : "absolute")
                << " symbol \"" << (*PCBegin)->getName()
@@ -409,7 +371,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
                          .takeError())
         return Err;
   } else {
-    LLVM_DEBUG(dbgs() << "        Record does not have LSDA field.\n");
+    LLVM_DEBUG(dbgs() << "      Record does not have LSDA field.\n");
   }
 
   return Error::success();
@@ -534,7 +496,7 @@ Expected<Symbol *> EHFrameEdgeFixer::getOrCreateEncodedPointerEdge(
     auto EdgeI = BlockEdges.find(PointerFieldOffset);
     if (EdgeI != BlockEdges.end()) {
       LLVM_DEBUG({
-        dbgs() << "        Existing edge at "
+        dbgs() << "      Existing edge at "
                << (BlockToFix.getAddress() + PointerFieldOffset) << " to "
                << FieldName << " at " << EdgeI->second.Target->getAddress();
         if (EdgeI->second.Target->hasName())
@@ -596,7 +558,7 @@ Expected<Symbol *> EHFrameEdgeFixer::getOrCreateEncodedPointerEdge(
   BlockToFix.addEdge(PtrEdgeKind, PointerFieldOffset, *TargetSym, 0);
 
   LLVM_DEBUG({
-    dbgs() << "        Adding edge at "
+    dbgs() << "      Adding edge at "
            << (BlockToFix.getAddress() + PointerFieldOffset) << " to "
            << FieldName << " at " << TargetSym->getAddress();
     if (TargetSym->hasName())
diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h
index 55cf7fc63ee795..cc3c24cc85ead0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h
+++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h
@@ -81,11 +81,9 @@ class EHFrameEdgeFixer {
   };
 
   Error processBlock(ParseContext &PC, Block &B);
-  Error processCIE(ParseContext &PC, Block &B, size_t RecordOffset,
-                   size_t RecordLength, size_t CIEDeltaFieldOffset,
+  Error processCIE(ParseContext &PC, Block &B, size_t CIEDeltaFieldOffset,
                    const BlockEdgeMap &BlockEdges);
-  Error processFDE(ParseContext &PC, Block &B, size_t RecordOffset,
-                   size_t RecordLength, size_t CIEDeltaFieldOffset,
+  Error processFDE(ParseContext &PC, Block &B, size_t CIEDeltaFieldOffset,
                    uint32_t CIEDelta, const BlockEdgeMap &BlockEdges);
 
   Expected<AugmentationInfo>
diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
index 37ced88169b0ac..124955236cf96a 100644
--- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
@@ -306,7 +306,6 @@ void writeImmediate(WritableArmRelocation &R, uint32_t Imm) {
 
 Expected<int64_t> readAddendData(LinkGraph &G, Block &B, const Edge &E) {
   support::endianness Endian = G.getEndianness();
-  assert(Endian != support::native && "Declare as little or big explicitly");
 
   Edge::Kind Kind = E.getKind();
   const char *BlockWorkingMem = B.getContent().data();
@@ -404,7 +403,6 @@ Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) {
   char *FixupPtr = BlockWorkingMem + E.getOffset();
 
   auto Write32 = [FixupPtr, Endian = G.getEndianness()](int64_t Value) {
-    assert(Endian != native && "Must be explicit: little or big");
     assert(isInt<32>(Value) && "Must be in signed 32-bit range");
     uint32_t Imm = static_cast<int32_t>(Value);
     if (LLVM_LIKELY(Endian == little))
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
index 13a0f83da8b515..2f70a11944e71f 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
@@ -158,8 +158,12 @@ class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase {
 
     std::optional<StringRef> FileName;
     if (!DebugLineSectionData.empty()) {
+      assert((G.getEndianness() == support::endianness::big ||
+              G.getEndianness() == support::endianness::little) &&
+             "G.getEndianness() must be either big or little");
       auto DWARFCtx = DWARFContext::create(DebugSectionMap, G.getPointerSize(),
-                                           G.getEndianness());
+                                           G.getEndianness() ==
+                                               support::endianness::little);
       DWARFDataExtractor DebugLineData(
           DebugLineSectionData,
           G.getEndianness() == support::endianness::little, G.getPointerSize());
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index a3a766d602c1ae..6b73366f3dc667 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -146,7 +146,7 @@ class MachOHeaderMaterializationUnit : public MaterializationUnit {
     Hdr.flags = 0;
     Hdr.reserved = 0;
 
-    if (G.getEndianness() != support::endian::system_endianness())
+    if (G.getEndianness() != llvm::endianness::native)
       MachO::swapStruct(Hdr);
 
     auto HeaderContent = G.allocateContent(
@@ -1460,7 +1460,7 @@ Error MachOPlatform::MachOPlatformPlugin::populateObjCRuntimeObject(
   auto SecContent = SecBlock.getAlreadyMutableContent();
   char *P = SecContent.data();
   auto WriteMachOStruct = [&](auto S) {
-    if (G.getEndianness() != support::endian::system_endianness())
+    if (G.getEndianness() != llvm::endianness::native)
       MachO::swapStruct(S);
     memcpy(P, &S, sizeof(S));
     P += sizeof(S);
diff --git a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp
index d4cbd1970d8f5d..70b536d2feda0f 100644
--- a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp
@@ -67,7 +67,7 @@ void IRSpeculationLayer::emit(std::unique_ptr<MaterializationResponsibility> R,
     auto SpeculatorVTy = StructType::create(MContext, "Class.Speculator");
     auto RuntimeCallTy = FunctionType::get(
         Type::getVoidTy(MContext),
-        {SpeculatorVTy->getPointerTo(), Type::getInt64Ty(MContext)}, false);
+        {PointerType::getUnqual(MContext), Type::getInt64Ty(MContext)}, false);
     auto RuntimeCall =
         Function::Create(RuntimeCallTy, Function::LinkageTypes::ExternalLinkage,
                          "__orc_speculate_for", &M);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 9c70d384e55db2..72e1af55fe63f6 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6126,6 +6126,42 @@ OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                                std::get<1>(FileIDInfo));
 }
 
+unsigned OpenMPIRBuilder::getFlagMemberOffset() {
+  unsigned Offset = 0;
+  for (uint64_t Remain =
+           static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
+               omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
+       !(Remain & 1); Remain = Remain >> 1)
+    Offset++;
+  return Offset;
+}
+
+omp::OpenMPOffloadMappingFlags
+OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
+  // Rotate by getFlagMemberOffset() bits.
+  return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
+                                                     << getFlagMemberOffset());
+}
+
+void OpenMPIRBuilder::setCorrectMemberOfFlag(
+    omp::OpenMPOffloadMappingFlags &Flags,
+    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
+  // If the entry is PTR_AND_OBJ but has not been marked with the special
+  // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
+  // marked as MEMBER_OF.
+  if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
+          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
+      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
+          (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
+          omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
+    return;
+
+  // Reset the placeholder value to prepare the flag for the assignment of the
+  // proper MEMBER_OF value.
+  Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
+  Flags |= MemberOfFlag;
+}
+
 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
     OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
     OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 6589bd33be7619..58cbde1bfb530d 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -2331,6 +2331,28 @@ bool ConstantExpr::isSupportedBinOp(unsigned Opcode) {
   }
 }
 
+bool ConstantExpr::isDesirableCastOp(unsigned Opcode) {
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return false;
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+    return true;
+  default:
+    llvm_unreachable("Argument must be cast opcode");
+  }
+}
+
 Constant *ConstantExpr::getSizeOf(Type* Ty) {
   // sizeof is implemented as: (i64) gep (Ty*)null, 1
   // Note that a non-inbounds gep is used, as null isn't within any object.
@@ -2345,7 +2367,7 @@ Constant *ConstantExpr::getAlignOf(Type* Ty) {
   // alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
   // Note that a non-inbounds gep is used, as null isn't within any object.
   Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty);
-  Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0));
+  Constant *NullPtr = Constant::getNullValue(PointerType::getUnqual(AligningTy->getContext()));
   Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
   Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
   Constant *Indices[2] = { Zero, One };
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b76e355996ea53..ece3b58792dd1f 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2136,10 +2136,10 @@ static bool isSingleSourceMaskImpl(ArrayRef<int> Mask, int NumOpElts) {
   return UsesLHS || UsesRHS;
 }
 
-bool ShuffleVectorInst::isSingleSourceMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isSingleSourceMask(ArrayRef<int> Mask, int NumSrcElts) {
   // We don't have vector operand size information, so assume operands are the
   // same size as the mask.
-  return isSingleSourceMaskImpl(Mask, Mask.size());
+  return isSingleSourceMaskImpl(Mask, NumSrcElts);
 }
 
 static bool isIdentityMaskImpl(ArrayRef<int> Mask, int NumOpElts) {
@@ -2154,65 +2154,75 @@ static bool isIdentityMaskImpl(ArrayRef<int> Mask, int NumOpElts) {
   return true;
 }
 
-bool ShuffleVectorInst::isIdentityMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isIdentityMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // We don't have vector operand size information, so assume operands are the
   // same size as the mask.
-  return isIdentityMaskImpl(Mask, Mask.size());
+  return isIdentityMaskImpl(Mask, NumSrcElts);
 }
 
-bool ShuffleVectorInst::isReverseMask(ArrayRef<int> Mask) {
-  if (!isSingleSourceMask(Mask))
+bool ShuffleVectorInst::isReverseMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
+  if (!isSingleSourceMask(Mask, NumSrcElts))
     return false;
 
   // The number of elements in the mask must be at least 2.
-  int NumElts = Mask.size();
-  if (NumElts < 2)
+  if (NumSrcElts < 2)
     return false;
 
-  for (int i = 0; i < NumElts; ++i) {
-    if (Mask[i] == -1)
+  for (int I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] == -1)
       continue;
-    if (Mask[i] != (NumElts - 1 - i) && Mask[i] != (NumElts + NumElts - 1 - i))
+    if (Mask[I] != (NumSrcElts - 1 - I) &&
+        Mask[I] != (NumSrcElts + NumSrcElts - 1 - I))
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isZeroEltSplatMask(ArrayRef<int> Mask) {
-  if (!isSingleSourceMask(Mask))
+bool ShuffleVectorInst::isZeroEltSplatMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
     return false;
-  for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
-    if (Mask[i] == -1)
+  if (!isSingleSourceMask(Mask, NumSrcElts))
+    return false;
+  for (int I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] == -1)
       continue;
-    if (Mask[i] != 0 && Mask[i] != NumElts)
+    if (Mask[I] != 0 && Mask[I] != NumSrcElts)
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isSelectMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isSelectMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // Select is differentiated from identity. It requires using both sources.
-  if (isSingleSourceMask(Mask))
+  if (isSingleSourceMask(Mask, NumSrcElts))
     return false;
-  for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
-    if (Mask[i] == -1)
+  for (int I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] == -1)
       continue;
-    if (Mask[i] != i && Mask[i] != (NumElts + i))
+    if (Mask[I] != I && Mask[I] != (NumSrcElts + I))
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask, int NumSrcElts) {
   // Example masks that will return true:
   // v1 = <a, b, c, d>
   // v2 = <e, f, g, h>
   // trn1 = shufflevector v1, v2 <0, 4, 2, 6> = <a, e, c, g>
   // trn2 = shufflevector v1, v2 <1, 5, 3, 7> = <b, f, d, h>
 
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // 1. The number of elements in the mask must be a power-of-2 and at least 2.
-  int NumElts = Mask.size();
-  if (NumElts < 2 || !isPowerOf2_32(NumElts))
+  int Sz = Mask.size();
+  if (Sz < 2 || !isPowerOf2_32(Sz))
     return false;
 
   // 2. The first element of the mask must be either a 0 or a 1.
@@ -2221,23 +2231,26 @@ bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask) {
 
   // 3. The difference between the first 2 elements must be equal to the
   // number of elements in the mask.
-  if ((Mask[1] - Mask[0]) != NumElts)
+  if ((Mask[1] - Mask[0]) != NumSrcElts)
     return false;
 
   // 4. The difference between consecutive even-numbered and odd-numbered
   // elements must be equal to 2.
-  for (int i = 2; i < NumElts; ++i) {
-    int MaskEltVal = Mask[i];
+  for (int I = 2; I < Sz; ++I) {
+    int MaskEltVal = Mask[I];
     if (MaskEltVal == -1)
       return false;
-    int MaskEltPrevVal = Mask[i - 2];
+    int MaskEltPrevVal = Mask[I - 2];
     if (MaskEltVal - MaskEltPrevVal != 2)
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isSpliceMask(ArrayRef<int> Mask, int &Index) {
+bool ShuffleVectorInst::isSpliceMask(ArrayRef<int> Mask, int NumSrcElts,
+                                     int &Index) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4>
   int StartIndex = -1;
   for (int I = 0, E = Mask.size(); I != E; ++I) {
@@ -2248,7 +2261,7 @@ bool ShuffleVectorInst::isSpliceMask(ArrayRef<int> Mask, int &Index) {
     if (StartIndex == -1) {
       // Don't support a StartIndex that begins in the second input, or if the
       // first non-undef index would access below the StartIndex.
-      if (MaskEltVal < I || E <= (MaskEltVal - I))
+      if (MaskEltVal < I || NumSrcElts <= (MaskEltVal - I))
         return false;
 
       StartIndex = MaskEltVal - I;
@@ -2543,7 +2556,7 @@ bool ShuffleVectorInst::isOneUseSingleSourceMask(int VF) const {
   // case.
   if (isa<ScalableVectorType>(getType()))
     return false;
-  if (!isSingleSourceMask(ShuffleMask))
+  if (!isSingleSourceMask(ShuffleMask, VF))
     return false;
 
   return isOneUseSingleSourceMask(ShuffleMask, VF);
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index acff1e2cf3b33b..443439b71e7566 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -151,6 +151,7 @@ static StringMap<lto::InputFile *>
 generateModuleMap(std::vector<std::unique_ptr<lto::InputFile>> &Modules) {
   StringMap<lto::InputFile *> ModuleMap;
   for (auto &M : Modules) {
+    LLVM_DEBUG(dbgs() << "Adding module " << M->getName() << " to ModuleMap\n");
     assert(!ModuleMap.contains(M->getName()) &&
            "Expect unique Buffer Identifier");
     ModuleMap[M->getName()] = M.get();
diff --git a/llvm/lib/MC/DXContainerPSVInfo.cpp b/llvm/lib/MC/DXContainerPSVInfo.cpp
index 533659053c36f3..bdc6f79a68c0c9 100644
--- a/llvm/lib/MC/DXContainerPSVInfo.cpp
+++ b/llvm/lib/MC/DXContainerPSVInfo.cpp
@@ -161,3 +161,51 @@ void PSVRuntimeInfo::write(raw_ostream &OS, uint32_t Version) const {
   support::endian::write_array(OS, ArrayRef<uint32_t>(PatchOutputMap),
                                support::little);
 }
+
+void Signature::write(raw_ostream &OS) {
+  SmallVector<dxbc::ProgramSignatureElement> SigParams;
+  SigParams.reserve(Params.size());
+  StringTableBuilder StrTabBuilder((StringTableBuilder::DWARF));
+
+  // Name offsets are from the start of the part. Pre-calculate the offset to
+  // the start of the string table so that it can be added to the table offset.
+  uint32_t TableStart = sizeof(dxbc::ProgramSignatureHeader) +
+                        (sizeof(dxbc::ProgramSignatureElement) * Params.size());
+
+  for (const auto &P : Params) {
+    // zero out the data
+    dxbc::ProgramSignatureElement FinalElement;
+    memset(&FinalElement, 0, sizeof(dxbc::ProgramSignatureElement));
+    FinalElement.Stream = P.Stream;
+    FinalElement.NameOffset =
+        static_cast<uint32_t>(StrTabBuilder.add(P.Name)) + TableStart;
+    FinalElement.Index = P.Index;
+    FinalElement.SystemValue = P.SystemValue;
+    FinalElement.CompType = P.CompType;
+    FinalElement.Register = P.Register;
+    FinalElement.Mask = P.Mask;
+    FinalElement.ExclusiveMask = P.ExclusiveMask;
+    FinalElement.MinPrecision = P.MinPrecision;
+    SigParams.push_back(FinalElement);
+  }
+
+  StrTabBuilder.finalizeInOrder();
+  stable_sort(SigParams, [&](const dxbc::ProgramSignatureElement &L,
+                             const dxbc::ProgramSignatureElement R) {
+    return std::tie(L.Stream, L.Register, L.NameOffset) <
+           std::tie(R.Stream, R.Register, R.NameOffset);
+  });
+  if (sys::IsBigEndianHost)
+    for (auto &El : SigParams)
+      El.swapBytes();
+
+  dxbc::ProgramSignatureHeader Header = {static_cast<uint32_t>(Params.size()),
+                                         sizeof(dxbc::ProgramSignatureHeader)};
+  if (sys::IsBigEndianHost)
+    Header.swapBytes();
+  OS.write(reinterpret_cast<const char *>(&Header),
+           sizeof(dxbc::ProgramSignatureHeader));
+  OS.write(reinterpret_cast<const char *>(SigParams.data()),
+           sizeof(dxbc::ProgramSignatureElement) * SigParams.size());
+  StrTabBuilder.write(OS);
+}
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index a2d61da722af87..bb3492bec8aad8 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -1402,6 +1402,9 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info,
     // here, but we'd have to emit the pdata, the xdata header, and the
     // epilogue scopes later, since they depend on whether the we need to
     // split the unwind data.
+    //
+    // If this is fixed, remove code in AArch64ISelLowering.cpp that
+    // disables loop alignment on Windows.
     RawFuncLength = GetAbsDifference(streamer, info->FuncletOrFuncEnd,
                                      info->Begin);
   }
diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp
index d18aed8b3b8c8d..0ee9f7fac448a2 100644
--- a/llvm/lib/Object/Binary.cpp
+++ b/llvm/lib/Object/Binary.cpp
@@ -87,6 +87,8 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
   case file_magic::cuda_fatbinary:
   case file_magic::coff_cl_gl_object:
   case file_magic::dxcontainer_object:
+  case file_magic::offload_bundle:
+  case file_magic::offload_bundle_compressed:
     // Unrecognized object file format.
     return errorCodeToError(object_error::invalid_file_type);
   case file_magic::offload_binary:
diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp
index a3071d2c1b72e9..433739fdc27df6 100644
--- a/llvm/lib/Object/DXContainer.cpp
+++ b/llvm/lib/Object/DXContainer.cpp
@@ -101,6 +101,31 @@ Error DXContainer::parsePSVInfo(StringRef Part) {
   return Error::success();
 }
 
+Error DirectX::Signature::initialize(StringRef Part) {
+  dxbc::ProgramSignatureHeader SigHeader;
+  if (Error Err = readStruct(Part, Part.begin(), SigHeader))
+    return Err;
+  size_t Size = sizeof(dxbc::ProgramSignatureElement) * SigHeader.ParamCount;
+
+  if (Part.size() < Size + SigHeader.FirstParamOffset)
+    return parseFailed("Signature parameters extend beyond the part boundary");
+
+  Parameters.Data = Part.substr(SigHeader.FirstParamOffset, Size);
+
+  StringTableOffset = SigHeader.FirstParamOffset + static_cast<uint32_t>(Size);
+  StringTable = Part.substr(SigHeader.FirstParamOffset + Size);
+
+  for (const auto &Param : Parameters) {
+    if (Param.NameOffset < StringTableOffset)
+      return parseFailed("Invalid parameter name offset: name starts before "
+                         "the first name offset");
+    if (Param.NameOffset - StringTableOffset > StringTable.size())
+      return parseFailed("Invalid parameter name offset: name starts after the "
+                         "end of the part data");
+  }
+  return Error::success();
+}
+
 Error DXContainer::parsePartOffsets() {
   uint32_t LastOffset =
       sizeof(dxbc::Header) + (Header.PartCount * sizeof(uint32_t));
@@ -154,6 +179,18 @@ Error DXContainer::parsePartOffsets() {
       if (Error Err = parsePSVInfo(PartData))
         return Err;
       break;
+    case dxbc::PartType::ISG1:
+      if (Error Err = InputSignature.initialize(PartData))
+        return Err;
+      break;
+    case dxbc::PartType::OSG1:
+      if (Error Err = OutputSignature.initialize(PartData))
+        return Err;
+      break;
+    case dxbc::PartType::PSG1:
+      if (Error Err = PatchConstantSignature.initialize(PartData))
+        return Err;
+      break;
     case dxbc::PartType::Unknown:
       break;
     }
diff --git a/llvm/lib/Object/MachOUniversalWriter.cpp b/llvm/lib/Object/MachOUniversalWriter.cpp
index 997235f0b74167..17940495cddd3f 100644
--- a/llvm/lib/Object/MachOUniversalWriter.cpp
+++ b/llvm/lib/Object/MachOUniversalWriter.cpp
@@ -100,7 +100,7 @@ Slice::Slice(const IRObjectFile &IRO, uint32_t CPUType, uint32_t CPUSubType,
 
 Slice::Slice(const MachOObjectFile &O) : Slice(O, calculateAlignment(O)) {}
 
-using MachoCPUTy = std::pair<unsigned, unsigned>;
+using MachoCPUTy = std::pair<uint32_t, uint32_t>;
 
 static Expected<MachoCPUTy> getMachoCPUFromTriple(Triple TT) {
   auto CPU = std::make_pair(MachO::getCPUType(TT), MachO::getCPUSubType(TT));
@@ -117,10 +117,15 @@ static Expected<MachoCPUTy> getMachoCPUFromTriple(StringRef TT) {
   return getMachoCPUFromTriple(Triple{TT});
 }
 
+static MachoCPUTy getMachoCPUFromObjectFile(const MachOObjectFile &O) {
+  return std::make_pair(O.getHeader().cputype, O.getHeader().cpusubtype);
+}
+
 Expected<Slice> Slice::create(const Archive &A, LLVMContext *LLVMCtx) {
   Error Err = Error::success();
   std::unique_ptr<MachOObjectFile> MFO = nullptr;
   std::unique_ptr<IRObjectFile> IRFO = nullptr;
+  std::optional<MachoCPUTy> CPU = std::nullopt;
   for (const Archive::Child &Child : A.children(Err)) {
     Expected<std::unique_ptr<Binary>> ChildOrErr = Child.getAsBinary(LLVMCtx);
     if (!ChildOrErr)
@@ -134,65 +139,56 @@ Expected<Slice> Slice::create(const Archive &A, LLVMContext *LLVMCtx) {
                                    .c_str());
     if (Bin->isMachO()) {
       MachOObjectFile *O = cast<MachOObjectFile>(Bin);
-      if (IRFO) {
-        return createStringError(
-            std::errc::invalid_argument,
-            "archive member %s is a MachO, while previous archive member "
-            "%s was an IR LLVM object",
-            O->getFileName().str().c_str(), IRFO->getFileName().str().c_str());
-      }
-      if (MFO &&
-          std::tie(MFO->getHeader().cputype, MFO->getHeader().cpusubtype) !=
-              std::tie(O->getHeader().cputype, O->getHeader().cpusubtype)) {
+      MachoCPUTy ObjectCPU = getMachoCPUFromObjectFile(*O);
+
+      if (CPU && CPU != ObjectCPU) {
+        // If CPU != nullptr, one of MFO, IRFO will be != nullptr.
+        StringRef PreviousName = MFO ? MFO->getFileName() : IRFO->getFileName();
         return createStringError(
             std::errc::invalid_argument,
             ("archive member " + O->getFileName() + " cputype (" +
-             Twine(O->getHeader().cputype) + ") and cpusubtype(" +
-             Twine(O->getHeader().cpusubtype) +
+             Twine(ObjectCPU.first) + ") and cpusubtype(" +
+             Twine(ObjectCPU.second) +
              ") does not match previous archive members cputype (" +
-             Twine(MFO->getHeader().cputype) + ") and cpusubtype(" +
-             Twine(MFO->getHeader().cpusubtype) +
-             ") (all members must match) " + MFO->getFileName())
+             Twine(CPU->first) + ") and cpusubtype(" + Twine(CPU->second) +
+             ") (all members must match) " + PreviousName)
                 .str()
                 .c_str());
       }
       if (!MFO) {
         ChildOrErr.get().release();
         MFO.reset(O);
+        if (!CPU)
+          CPU.emplace(ObjectCPU);
       }
     } else if (Bin->isIR()) {
       IRObjectFile *O = cast<IRObjectFile>(Bin);
-      if (MFO) {
-        return createStringError(std::errc::invalid_argument,
-                                 "archive member '%s' is an LLVM IR object, "
-                                 "while previous archive member "
-                                 "'%s' was a MachO",
-                                 O->getFileName().str().c_str(),
-                                 MFO->getFileName().str().c_str());
+      Expected<MachoCPUTy> ObjectCPU =
+          getMachoCPUFromTriple(O->getTargetTriple());
+      if (!ObjectCPU)
+        return ObjectCPU.takeError();
+
+      if (CPU && CPU != *ObjectCPU) {
+        // If CPU != nullptr, one of MFO, IRFO will be != nullptr.
+        StringRef PreviousName =
+            IRFO ? IRFO->getFileName() : MFO->getFileName();
+        return createStringError(
+            std::errc::invalid_argument,
+            ("archive member " + O->getFileName() + " cputype (" +
+             Twine(ObjectCPU->first) + ") and cpusubtype(" +
+             Twine(ObjectCPU->second) +
+             ") does not match previous archive members cputype (" +
+             Twine(CPU->first) + ") and cpusubtype(" + Twine(CPU->second) +
+             ") (all members must match) " + PreviousName)
+                .str()
+                .c_str());
       }
-      if (IRFO) {
-        Expected<MachoCPUTy> CPUO = getMachoCPUFromTriple(O->getTargetTriple());
-        Expected<MachoCPUTy> CPUFO =
-            getMachoCPUFromTriple(IRFO->getTargetTriple());
-        if (!CPUO)
-          return CPUO.takeError();
-        if (!CPUFO)
-          return CPUFO.takeError();
-        if (*CPUO != *CPUFO) {
-          return createStringError(
-              std::errc::invalid_argument,
-              ("archive member " + O->getFileName() + " cputype (" +
-               Twine(CPUO->first) + ") and cpusubtype(" + Twine(CPUO->second) +
-               ") does not match previous archive members cputype (" +
-               Twine(CPUFO->first) + ") and cpusubtype(" +
-               Twine(CPUFO->second) + ") (all members must match) " +
-               IRFO->getFileName())
-                  .str()
-                  .c_str());
-        }
-      } else {
+
+      if (!IRFO) {
         ChildOrErr.get().release();
         IRFO.reset(O);
+        if (!CPU)
+          CPU.emplace(*ObjectCPU);
       }
     } else
       return createStringError(std::errc::invalid_argument,
@@ -281,7 +277,7 @@ buildFatArchList(ArrayRef<Slice> Slices) {
               .str()
               .c_str());
 
-    FatArchTy FatArch;
+    FatArchTy FatArch = {};
     FatArch.cputype = S.getCPUType();
     FatArch.cpusubtype = S.getCPUSubType();
     FatArch.offset = Offset;
diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp
index 2a672467f422dc..428166f58070d0 100644
--- a/llvm/lib/Object/ObjectFile.cpp
+++ b/llvm/lib/Object/ObjectFile.cpp
@@ -158,6 +158,8 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type,
   case file_magic::cuda_fatbinary:
   case file_magic::offload_binary:
   case file_magic::dxcontainer_object:
+  case file_magic::offload_bundle:
+  case file_magic::offload_bundle_compressed:
     return errorCodeToError(object_error::invalid_file_type);
   case file_magic::tapi_file:
     return errorCodeToError(object_error::invalid_file_type);
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index e9471adf66eda5..0982c7e2efbb95 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -723,17 +723,21 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
       Info.Name = readString(Ctx);
       if (IsDefined) {
         auto Index = readVaruint32(Ctx);
-        if (Index >= DataSegments.size())
-          return make_error<GenericBinaryError>("invalid data segment index",
-                                                object_error::parse_failed);
         auto Offset = readVaruint64(Ctx);
         auto Size = readVaruint64(Ctx);
-        size_t SegmentSize = DataSegments[Index].Data.Content.size();
-        if (Offset > SegmentSize)
-          return make_error<GenericBinaryError>(
-              "invalid data symbol offset: `" + Info.Name + "` (offset: " +
-                  Twine(Offset) + " segment size: " + Twine(SegmentSize) + ")",
-              object_error::parse_failed);
+        if (!(Info.Flags & wasm::WASM_SYMBOL_ABSOLUTE)) {
+          if (static_cast<size_t>(Index) >= DataSegments.size())
+            return make_error<GenericBinaryError>(
+                "invalid data segment index: " + Twine(Index),
+                object_error::parse_failed);
+          size_t SegmentSize = DataSegments[Index].Data.Content.size();
+          if (Offset > SegmentSize)
+            return make_error<GenericBinaryError>(
+                "invalid data symbol offset: `" + Info.Name +
+                    "` (offset: " + Twine(Offset) +
+                    " segment size: " + Twine(SegmentSize) + ")",
+                object_error::parse_failed);
+        }
         Info.DataRef = wasm::WasmDataReference{Index, Offset, Size};
       }
       break;
diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
index 6b0e1790888854..09a5e41c71234f 100644
--- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
@@ -244,6 +244,20 @@ void DXContainerWriter::writeParts(raw_ostream &OS) {
       PSV.write(OS, P.Info->Version);
       break;
     }
+    case dxbc::PartType::ISG1:
+    case dxbc::PartType::OSG1:
+    case dxbc::PartType::PSG1: {
+      mcdxbc::Signature Sig;
+      if (P.Signature.has_value()) {
+        for (const auto &Param : P.Signature->Parameters) {
+          Sig.addParam(Param.Stream, Param.Name, Param.Index, Param.SystemValue,
+                       Param.CompType, Param.Register, Param.Mask,
+                       Param.ExclusiveMask, Param.MinPrecision);
+        }
+      }
+      Sig.write(OS);
+      break;
+    }
     case dxbc::PartType::Unknown:
       break; // Skip any handling for unrecognized parts.
     }
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index c7cf1ec9afc1f6..1f03f2c7d39966 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -159,6 +159,24 @@ void MappingTraits<DXContainerYAML::PSVInfo>::mapping(
     IO.mapRequired("PatchOutputMap", PSV.PatchOutputMap);
 }
 
+void MappingTraits<DXContainerYAML::SignatureParameter>::mapping(
+    IO &IO, DXContainerYAML::SignatureParameter &S) {
+  IO.mapRequired("Stream", S.Stream);
+  IO.mapRequired("Name", S.Name);
+  IO.mapRequired("Index", S.Index);
+  IO.mapRequired("SystemValue", S.SystemValue);
+  IO.mapRequired("CompType", S.CompType);
+  IO.mapRequired("Register", S.Register);
+  IO.mapRequired("Mask", S.Mask);
+  IO.mapRequired("ExclusiveMask", S.ExclusiveMask);
+  IO.mapRequired("MinPrecision", S.MinPrecision);
+}
+
+void MappingTraits<DXContainerYAML::Signature>::mapping(
+    IO &IO, DXContainerYAML::Signature &S) {
+  IO.mapRequired("Parameters", S.Parameters);
+}
+
 void MappingTraits<DXContainerYAML::Part>::mapping(IO &IO,
                                                    DXContainerYAML::Part &P) {
   IO.mapRequired("Name", P.Name);
@@ -167,6 +185,7 @@ void MappingTraits<DXContainerYAML::Part>::mapping(IO &IO,
   IO.mapOptional("Flags", P.Flags);
   IO.mapOptional("Hash", P.Hash);
   IO.mapOptional("PSVInfo", P.Info);
+  IO.mapOptional("Signature", P.Signature);
 }
 
 void MappingTraits<DXContainerYAML::Object>::mapping(
@@ -224,6 +243,24 @@ void ScalarEnumerationTraits<dxbc::PSV::InterpolationMode>::enumeration(
     IO.enumCase(Value, E.Name.str().c_str(), E.Value);
 }
 
+void ScalarEnumerationTraits<dxbc::D3DSystemValue>::enumeration(
+    IO &IO, dxbc::D3DSystemValue &Value) {
+  for (const auto &E : dxbc::getD3DSystemValues())
+    IO.enumCase(Value, E.Name.str().c_str(), E.Value);
+}
+
+void ScalarEnumerationTraits<dxbc::SigMinPrecision>::enumeration(
+    IO &IO, dxbc::SigMinPrecision &Value) {
+  for (const auto &E : dxbc::getSigMinPrecisions())
+    IO.enumCase(Value, E.Name.str().c_str(), E.Value);
+}
+
+void ScalarEnumerationTraits<dxbc::SigComponentType>::enumeration(
+    IO &IO, dxbc::SigComponentType &Value) {
+  for (const auto &E : dxbc::getSigComponentTypes())
+    IO.enumCase(Value, E.Name.str().c_str(), E.Value);
+}
+
 } // namespace yaml
 
 void DXContainerYAML::PSVInfo::mapInfoForVersion(yaml::IO &IO) {
diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp
index ef47766a239420..9502fe5e407788 100644
--- a/llvm/lib/ObjectYAML/WasmYAML.cpp
+++ b/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -518,7 +518,9 @@ void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO,
     IO.mapRequired("Tag", Info.ElementIndex);
   } else if (Info.Kind == wasm::WASM_SYMBOL_TYPE_DATA) {
     if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) {
-      IO.mapRequired("Segment", Info.DataRef.Segment);
+      if ((Info.Flags & wasm::WASM_SYMBOL_ABSOLUTE) == 0) {
+        IO.mapRequired("Segment", Info.DataRef.Segment);
+      }
       IO.mapOptional("Offset", Info.DataRef.Offset, 0u);
       IO.mapRequired("Size", Info.DataRef.Size);
     }
@@ -573,6 +575,7 @@ void ScalarBitSetTraits<WasmYAML::SymbolFlags>::bitset(
   BCaseMask(EXPLICIT_NAME, EXPLICIT_NAME);
   BCaseMask(NO_STRIP, NO_STRIP);
   BCaseMask(TLS, TLS);
+  BCaseMask(ABSOLUTE, ABSOLUTE);
 #undef BCaseMask
 }
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index e46b73525f55ae..2a4ed54c2bc30d 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -223,19 +223,59 @@ Expected<int64_t> CounterMappingContext::evaluate(const Counter &C) const {
 }
 
 unsigned CounterMappingContext::getMaxCounterID(const Counter &C) const {
-  switch (C.getKind()) {
-  case Counter::Zero:
-    return 0;
-  case Counter::CounterValueReference:
-    return C.getCounterID();
-  case Counter::Expression: {
-    if (C.getExpressionID() >= Expressions.size())
-      return 0;
-    const auto &E = Expressions[C.getExpressionID()];
-    return std::max(getMaxCounterID(E.LHS), getMaxCounterID(E.RHS));
-  }
+  struct StackElem {
+    Counter ICounter;
+    int64_t LHS = 0;
+    enum {
+      KNeverVisited = 0,
+      KVisitedOnce = 1,
+      KVisitedTwice = 2,
+    } VisitCount = KNeverVisited;
+  };
+
+  std::stack<StackElem> CounterStack;
+  CounterStack.push({C});
+
+  int64_t LastPoppedValue;
+
+  while (!CounterStack.empty()) {
+    StackElem &Current = CounterStack.top();
+
+    switch (Current.ICounter.getKind()) {
+    case Counter::Zero:
+      LastPoppedValue = 0;
+      CounterStack.pop();
+      break;
+    case Counter::CounterValueReference:
+      LastPoppedValue = Current.ICounter.getCounterID();
+      CounterStack.pop();
+      break;
+    case Counter::Expression: {
+      if (Current.ICounter.getExpressionID() >= Expressions.size()) {
+        LastPoppedValue = 0;
+        CounterStack.pop();
+      } else {
+        const auto &E = Expressions[Current.ICounter.getExpressionID()];
+        if (Current.VisitCount == StackElem::KNeverVisited) {
+          CounterStack.push(StackElem{E.LHS});
+          Current.VisitCount = StackElem::KVisitedOnce;
+        } else if (Current.VisitCount == StackElem::KVisitedOnce) {
+          Current.LHS = LastPoppedValue;
+          CounterStack.push(StackElem{E.RHS});
+          Current.VisitCount = StackElem::KVisitedTwice;
+        } else {
+          int64_t LHS = Current.LHS;
+          int64_t RHS = LastPoppedValue;
+          LastPoppedValue = std::max(LHS, RHS);
+          CounterStack.pop();
+        }
+      }
+      break;
+    }
+    }
   }
-  llvm_unreachable("Unhandled CounterKind");
+
+  return LastPoppedValue;
 }
 
 void FunctionRecordIterator::skipOtherFiles() {
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index e82cb5c535f1f5..1375713cb3f805 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -989,7 +989,7 @@ void ValueProfRecord::swapBytes(support::endianness Old,
   if (Old == New)
     return;
 
-  if (getHostEndianness() != Old) {
+  if (llvm::endianness::native != Old) {
     sys::swapByteOrder<uint32_t>(NumValueSites);
     sys::swapByteOrder<uint32_t>(Kind);
   }
@@ -1001,7 +1001,7 @@ void ValueProfRecord::swapBytes(support::endianness Old,
     sys::swapByteOrder<uint64_t>(VD[I].Value);
     sys::swapByteOrder<uint64_t>(VD[I].Count);
   }
-  if (getHostEndianness() == Old) {
+  if (llvm::endianness::native == Old) {
     sys::swapByteOrder<uint32_t>(NumValueSites);
     sys::swapByteOrder<uint32_t>(Kind);
   }
@@ -1086,7 +1086,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
 void ValueProfData::swapBytesToHost(support::endianness Endianness) {
   using namespace support;
 
-  if (Endianness == getHostEndianness())
+  if (Endianness == llvm::endianness::native)
     return;
 
   sys::swapByteOrder<uint32_t>(TotalSize);
@@ -1094,7 +1094,7 @@ void ValueProfData::swapBytesToHost(support::endianness Endianness) {
 
   ValueProfRecord *VR = getFirstValueProfRecord(this);
   for (uint32_t K = 0; K < NumValueKinds; K++) {
-    VR->swapBytes(Endianness, getHostEndianness());
+    VR->swapBytes(Endianness, llvm::endianness::native);
     VR = getValueProfRecordNext(VR);
   }
 }
@@ -1102,13 +1102,13 @@ void ValueProfData::swapBytesToHost(support::endianness Endianness) {
 void ValueProfData::swapBytesFromHost(support::endianness Endianness) {
   using namespace support;
 
-  if (Endianness == getHostEndianness())
+  if (Endianness == llvm::endianness::native)
     return;
 
   ValueProfRecord *VR = getFirstValueProfRecord(this);
   for (uint32_t K = 0; K < NumValueKinds; K++) {
     ValueProfRecord *NVR = getValueProfRecordNext(VR);
-    VR->swapBytes(getHostEndianness(), Endianness);
+    VR->swapBytes(llvm::endianness::native, Endianness);
     VR = NVR;
   }
   sys::swapByteOrder<uint32_t>(TotalSize);
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 9375678f804616..125184d4020840 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -38,7 +38,7 @@
 
 using namespace llvm;
 
-// Extracts the variant information from the top 8 bits in the version and
+// Extracts the variant information from the top 32 bits in the version and
 // returns an enum specifying the variants present.
 static InstrProfKind getProfileKindFromVersion(uint64_t Version) {
   InstrProfKind ProfileKind = InstrProfKind::Unknown;
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index b74d5c3862d803..2fd09713c9b050 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -722,7 +722,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash,
       std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S);
       for (uint32_t I = 0; I < ND; I++) {
         if (VK == IPVK_IndirectCallTarget)
-          OS << Symtab.getFuncNameOrExternalSymbol(VD[I].Value) << ":"
+          OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":"
              << VD[I].Count << "\n";
         else
           OS << VD[I].Value << ":" << VD[I].Count << "\n";
@@ -790,7 +790,7 @@ void InstrProfWriter::writeTextTemporalProfTraceData(raw_fd_ostream &OS,
   for (auto &Trace : TemporalProfTraces) {
     OS << "# Weight:\n" << Trace.Weight << "\n";
     for (auto &NameRef : Trace.FunctionNameRefs)
-      OS << Symtab.getFuncName(NameRef) << ",";
+      OS << Symtab.getFuncOrVarName(NameRef) << ",";
     OS << "\n";
   }
   OS << "\n";
diff --git a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp
index c11ee59da0dda3..58a06a34e8cf39 100644
--- a/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/llvm/lib/Support/FileOutputBuffer.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Memory.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <system_error>
 
 #if !defined(_MSC_VER) && !defined(__MINGW32__)
@@ -43,6 +44,8 @@ class OnDiskBuffer : public FileOutputBuffer {
   size_t getBufferSize() const override { return Buffer.size(); }
 
   Error commit() override {
+    llvm::TimeTraceScope timeScope("Commit buffer to disk");
+
     // Unmap buffer, letting OS flush dirty pages to file on disk.
     Buffer.unmap();
 
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
index c4f50b7773b75d..72d33e1e65c8f5 100644
--- a/llvm/lib/Support/RISCVISAInfo.cpp
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -106,6 +106,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = {
 
     {"zdinx", RISCVExtensionVersion{1, 0}},
 
+    {"zfa", RISCVExtensionVersion{1, 0}},
     {"zfh", RISCVExtensionVersion{1, 0}},
     {"zfhmin", RISCVExtensionVersion{1, 0}},
     {"zfinx", RISCVExtensionVersion{1, 0}},
@@ -166,7 +167,6 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
 
     {"zacas", RISCVExtensionVersion{1, 0}},
 
-    {"zfa", RISCVExtensionVersion{0, 2}},
     {"zfbfmin", RISCVExtensionVersion{0, 8}},
 
     {"zicfilp", RISCVExtensionVersion{0, 2}},
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8e5ded8dd47aa1..3ae7a893ca4e9e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1051,7 +1051,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   // Set required alignment.
   setMinFunctionAlignment(Align(4));
   // Set preferred alignments.
-  setPrefLoopAlignment(STI.getPrefLoopAlignment());
+
+  // Don't align loops on Windows. The SEH unwind info generation needs to
+  // know the exact length of functions before the alignments have been
+  // expanded.
+  if (!Subtarget->isTargetWindows())
+    setPrefLoopAlignment(STI.getPrefLoopAlignment());
   setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
   setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
 
@@ -11808,7 +11813,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
 
   if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
        (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
-      ShuffleVectorInst::isReverseMask(ShuffleMask)) {
+      ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
     SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
     return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
                        DAG.getConstant(8, dl, MVT::i32));
@@ -25825,6 +25830,79 @@ AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
   }
 }
 
+static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
+                                         ArrayRef<int> ShuffleMask, EVT VT,
+                                         EVT ContainerVT, SelectionDAG &DAG) {
+  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  SDLoc DL(Op);
+  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+  bool IsSingleOp =
+      ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
+
+  // Ignore two operands if no SVE2 or all index numbers couldn't
+  // be represented.
+  if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
+    return SDValue();
+
+  EVT VTOp1 = Op.getOperand(0).getValueType();
+  unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
+  unsigned IndexLen = MinSVESize / BitsPerElt;
+  unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
+  unsigned MaskSize = ShuffleMask.size();
+  uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
+  assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen &&
+         "Incorrectly legalised shuffle operation");
+  (void)MaskSize;
+
+  SmallVector<SDValue, 8> TBLMask;
+  for (int Index : ShuffleMask) {
+    // Handling poison index value.
+    if (Index < 0)
+      Index = 0;
+    // If we refer to the second operand then we have to add elements
+    // number in hardware register minus number of elements in a type.
+    if ((unsigned)Index >= ElementsPerVectorReg)
+      Index += IndexLen - ElementsPerVectorReg;
+    // For 8-bit elements and 1024-bit SVE registers and MaxOffset equals
+    // to 255, this might point to the last element of in the second operand
+    // of the shufflevector, thus we are rejecting this transform.
+    if ((unsigned)Index >= MaxOffset)
+      return SDValue();
+    TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
+  }
+
+  // Choosing an out-of-range index leads to the lane being zeroed vs zero
+  // value where it would perform first lane duplication for out of
+  // index elements. For i8 elements an out-of-range index could be a valid
+  // for 2048-bit vector register size.
+  for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i)
+    TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
+
+  EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
+  EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
+  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
+  SDValue VecMask =
+      DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
+  SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
+
+  SDValue Shuffle;
+  if (IsSingleOp)
+    Shuffle =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                    DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
+                    Op1, SVEMask);
+  else if (Subtarget.hasSVE2())
+    Shuffle =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                    DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
+                    Op1, Op2, SVEMask);
+  else
+    llvm_unreachable("Cannot lower shuffle without SVE2 TBL");
+  Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -25944,7 +26022,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
   unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
   unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
   if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
-    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
+    if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
+        Op2.isUndef()) {
       Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
       return convertFromScalableVector(DAG, VT, Op);
     }
@@ -25970,6 +26049,11 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     }
   }
 
+  // Avoid producing TBL instruction if we don't know SVE register minimal size.
+  if (MinSVESize)
+    return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
+                                     DAG);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index cd377d659ad3ee..5293df90b880b8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1482,12 +1482,14 @@ def : InstAlias<"xpaclri", (XPACLRI), 0>;
 
 // Pseudos
 
+let Uses = [LR, SP], Defs = [LR] in {
 // Insertion point of LR signing code.
 def PAUTH_PROLOGUE : Pseudo<(outs), (ins), []>, Sched<[]>;
 // Insertion point of LR authentication code.
 // The RET terminator of the containing machine basic block may be replaced
 // with a combined RETA(A|B) instruction when rewriting this Pseudo.
 def PAUTH_EPILOGUE : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
 
 // These pointer authentication instructions require armv8.3a
 let Predicates = [HasPAuth] in {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 3d818c76bd4b7d..fcc30a7cfceaf4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -200,7 +200,7 @@ static cl::opt<bool> EnableGISelLoadStoreOptPostLegal(
 static cl::opt<bool>
     EnableSinkFold("aarch64-enable-sink-fold",
                    cl::desc("Enable sinking and folding of instruction copies"),
-                   cl::init(false), cl::Hidden);
+                   cl::init(true), cl::Hidden);
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
   // Register the target.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e6209ca12a48c3..cded28054f5925 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2461,6 +2461,25 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
             FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
       return AdjustCost(Entry->Cost);
 
+  if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
+      CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
+      TLI->getTypeAction(Src->getContext(), SrcTy) ==
+          TargetLowering::TypePromoteInteger &&
+      TLI->getTypeAction(Dst->getContext(), DstTy) ==
+          TargetLowering::TypeSplitVector) {
+    // The standard behaviour in the backend for these cases is to split the
+    // extend up into two parts:
+    //  1. Perform an extending load or masked load up to the legal type.
+    //  2. Extend the loaded data to the final type.
+    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
+    Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
+    InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
+        Opcode, LegalTy, Src, CCH, CostKind, I);
+    InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
+        Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
+    return Part1 + Part2;
+  }
+
   // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
   // but we also want to include the TTI::CastContextHint::Masked case too.
   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
@@ -3558,11 +3577,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // into smaller vectors and sum the cost of each shuffle.
   if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
       Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
-      cast<FixedVectorType>(Tp)->getNumElements() >
-          LT.second.getVectorNumElements() &&
-      !Index && !SubTp) {
-    unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
-    assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
+      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
+    unsigned TpNumElts = Mask.size();
     unsigned LTNumElts = LT.second.getVectorNumElements();
     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
     VectorType *NTp =
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index e0837b689607cc..1c7a09696e853e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2251,7 +2251,7 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
     // Before selecting a DUP instruction, check if it is better selected as a
     // MOV or load from a constant pool.
     Register Src = I.getOperand(1).getReg();
-    auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
+    auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
     if (!ValAndVReg)
       return false;
     LLVMContext &Ctx = MF.getFunction().getContext();
@@ -5600,8 +5600,7 @@ MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
   if (DstSize == 128) {
     if (Bits.getHiBits(64) != Bits.getLoBits(64))
       return nullptr;
-    // Need to deal with 4f32
-    Op = AArch64::FMOVv2f64_ns;
+    Op = AArch64::FMOVv4f32_ns;
     IsWide = true;
   } else {
     Op = AArch64::FMOVv2f32_ns;
@@ -5610,9 +5609,10 @@ MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
 
   if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
-    Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
+    Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
   } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
     Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
+    Op = AArch64::FMOVv2f64_ns;
   } else
     return nullptr;
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 11c314dc88def7..03cbd272757e78 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -591,6 +591,27 @@ static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) {
 // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
 // cmode: 1110, op: 1
 static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
+#if defined(_MSC_VER) && _MSC_VER == 1937 && !defined(__clang__) &&            \
+    defined(_M_ARM64)
+  // The MSVC compiler 19.37 for ARM64 has an optimization bug that
+  // causes an incorrect behavior with the orignal version. Work around
+  // by using a slightly different variation.
+  // https://developercommunity.visualstudio.com/t/C-ARM64-compiler-optimization-bug/10481261
+  constexpr uint64_t Mask = 0xFFULL;
+  uint64_t ByteA = (Imm >> 56) & Mask;
+  uint64_t ByteB = (Imm >> 48) & Mask;
+  uint64_t ByteC = (Imm >> 40) & Mask;
+  uint64_t ByteD = (Imm >> 32) & Mask;
+  uint64_t ByteE = (Imm >> 24) & Mask;
+  uint64_t ByteF = (Imm >> 16) & Mask;
+  uint64_t ByteG = (Imm >> 8) & Mask;
+  uint64_t ByteH = Imm & Mask;
+
+  return (ByteA == 0ULL || ByteA == Mask) && (ByteB == 0ULL || ByteB == Mask) &&
+         (ByteC == 0ULL || ByteC == Mask) && (ByteD == 0ULL || ByteD == Mask) &&
+         (ByteE == 0ULL || ByteE == Mask) && (ByteF == 0ULL || ByteF == Mask) &&
+         (ByteG == 0ULL || ByteG == Mask) && (ByteH == 0ULL || ByteH == Mask);
+#else
   uint64_t ByteA = Imm & 0xff00000000000000ULL;
   uint64_t ByteB = Imm & 0x00ff000000000000ULL;
   uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
@@ -608,6 +629,7 @@ static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
          (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
          (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
          (ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
+#endif
 }
 
 static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index f49945da0da218..5296415ab4c36d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -830,7 +830,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
         if (CCmp->isNullValue()) {
           return IC.replaceInstUsesWith(
-              II, ConstantExpr::getSExt(CCmp, II.getType()));
+              II, IC.Builder.CreateSExt(CCmp, II.getType()));
         }
 
         // The result of V_ICMP/V_FCMP assembly instructions (which this
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index cec35d1147bb0a..4297111930de25 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -574,11 +574,11 @@ class MUBUF_Store_Pseudo <string opName,
 
 multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
 
-  def _OFFSET : GCNPat <
+  def : GCNPat <
     (st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
     (!cast<MUBUF_Pseudo>(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
 
-  def _ADDR64 : GCNPat <
+  def : GCNPat <
     (st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)),
     (!cast<MUBUF_Pseudo>(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
 }
@@ -912,10 +912,22 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>;
+}
+
+foreach vt = VReg_96.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
+}
 
 defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
   "buffer_store_byte", i32
@@ -938,10 +950,22 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
 
 defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>;
 defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", i32, store_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", v2i32, store_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", v3i32, store_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", v4i32, store_global>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>;
+}
+
+foreach vt = VReg_96.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
+}
 
 defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
   "buffer_atomic_swap", VGPR_32, i32
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 2216acf128bfa5..60cd9d4c3c35a2 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -152,6 +152,13 @@ class SIFixSGPRCopies : public MachineFunctionPass {
 
   void processPHINode(MachineInstr &MI);
 
+  // Check if MO is an immediate materialized into a VGPR, and if so replace it
+  // with an SGPR immediate. The VGPR immediate is also deleted if it does not
+  // have any other uses.
+  bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst,
+                              MachineBasicBlock *BlockToInsertTo,
+                              MachineBasicBlock::iterator PointToInsertTo);
+
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -662,13 +669,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
                              : MBB;
               MachineBasicBlock::iterator PointToInsertCopy =
                   MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;
-              MachineInstr *NewCopy =
-                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
-                          PointToInsertCopy->getDebugLoc(),
-                          TII->get(AMDGPU::COPY), NewDst)
-                      .addReg(MO.getReg());
-              MO.setReg(NewDst);
-              analyzeVGPRToSGPRCopy(NewCopy);
+
+              if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy,
+                                          PointToInsertCopy)) {
+                MachineInstr *NewCopy =
+                    BuildMI(*BlockToInsertCopy, PointToInsertCopy,
+                            PointToInsertCopy->getDebugLoc(),
+                            TII->get(AMDGPU::COPY), NewDst)
+                        .addReg(MO.getReg());
+                MO.setReg(NewDst);
+                analyzeVGPRToSGPRCopy(NewCopy);
+              }
             }
           }
         }
@@ -829,6 +840,32 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   }
 }
 
+bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
+    MachineOperand &MaybeVGPRConstMO, Register DstReg,
+    MachineBasicBlock *BlockToInsertTo,
+    MachineBasicBlock::iterator PointToInsertTo) {
+
+  MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg());
+  if (!DefMI || !DefMI->isMoveImmediate())
+    return false;
+
+  MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0);
+  if (SrcConst->isReg())
+    return false;
+
+  const TargetRegisterClass *SrcRC =
+      MRI->getRegClass(MaybeVGPRConstMO.getReg());
+  unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
+  unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+  BuildMI(*BlockToInsertTo, PointToInsertTo, PointToInsertTo->getDebugLoc(),
+          TII->get(MoveOp), DstReg)
+      .add(*SrcConst);
+  if (MRI->hasOneUse(MaybeVGPRConstMO.getReg()))
+    DefMI->eraseFromParent();
+  MaybeVGPRConstMO.setReg(DstReg);
+  return true;
+}
+
 bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
                                        MachineBasicBlock::iterator &I) {
   Register DstReg = MI.getOperand(0).getReg();
@@ -846,25 +883,10 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
               TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
           .add(MI.getOperand(1));
       MI.getOperand(1).setReg(TmpReg);
-    } else {
-      MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
-      if (DefMI && DefMI->isMoveImmediate()) {
-        MachineOperand SrcConst = DefMI->getOperand(AMDGPU::getNamedOperandIdx(
-            DefMI->getOpcode(), AMDGPU::OpName::src0));
-        if (!SrcConst.isReg()) {
-          const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
-          unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
-          unsigned MoveOp =
-              MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
-          BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MoveOp),
-                  DstReg)
-              .add(SrcConst);
-          I = std::next(I);
-          if (MRI->hasOneUse(SrcReg))
-            DefMI->eraseFromParent();
-          MI.eraseFromParent();
-        }
-      }
+    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
+                                      MI)) {
+      I = std::next(I);
+      MI.eraseFromParent();
     }
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f170428b38c49a..0a32844bdb01a0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10810,6 +10810,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
 // performed.
 static const std::optional<ByteProvider<SDValue>>
 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
+                 std::optional<bool> IsSigned = std::nullopt,
                  unsigned Depth = 0) {
   // We may need to recursively traverse a series of SRLs
   if (Depth >= 6)
@@ -10821,12 +10822,16 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 
   switch (Op->getOpcode()) {
   case ISD::TRUNCATE: {
-    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+                            Depth + 1);
   }
 
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::SIGN_EXTEND_INREG: {
+    IsSigned = IsSigned.value_or(false) ||
+               Op->getOpcode() == ISD::SIGN_EXTEND ||
+               Op->getOpcode() == ISD::SIGN_EXTEND_INREG;
     SDValue NarrowOp = Op->getOperand(0);
     auto NarrowVT = NarrowOp.getValueType();
     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
@@ -10839,7 +10844,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 
     if (SrcIndex >= NarrowByteWidth)
       return std::nullopt;
-    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+                            Depth + 1);
   }
 
   case ISD::SRA:
@@ -10855,11 +10861,24 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 
     SrcIndex += BitShift / 8;
 
-    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+                            Depth + 1);
   }
 
   default: {
-    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+    if (auto A = dyn_cast<AtomicSDNode>(Op) || Op->isMemIntrinsic()) {
+      // If this causes us to throw away signedness info, then fail.
+      if (IsSigned)
+        return std::nullopt;
+      return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+    }
+
+    if (auto L = dyn_cast<LoadSDNode>(Op))
+      if (L->getExtensionType() != ISD::NON_EXTLOAD)
+        IsSigned =
+            IsSigned.value_or(false) || L->getExtensionType() == ISD::SEXTLOAD;
+
+    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex, IsSigned);
   }
   }
   llvm_unreachable("fully handled switch");
@@ -10873,7 +10892,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
 // performed. \p StartingIndex is the originally requested byte of the Or
 static const std::optional<ByteProvider<SDValue>>
 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
-                      unsigned StartingIndex = 0) {
+                      unsigned StartingIndex = 0,
+                      std::optional<bool> IsSigned = std::nullopt) {
   // Finding Src tree of RHS of or typically requires at least 1 additional
   // depth
   if (Depth > 6)
@@ -10888,11 +10908,11 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
   switch (Op.getOpcode()) {
   case ISD::OR: {
     auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
-                                     StartingIndex);
+                                     StartingIndex, IsSigned);
     if (!RHS)
       return std::nullopt;
     auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
-                                     StartingIndex);
+                                     StartingIndex, IsSigned);
     if (!LHS)
       return std::nullopt;
     // A well formed Or will have two ByteProviders for each byte, one of which
@@ -10923,7 +10943,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
       return ByteProvider<SDValue>::getConstantZero();
     }
 
-    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
+    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index, IsSigned);
   }
 
   case ISD::FSHR: {
@@ -10972,7 +10992,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     // the SRL is Index + ByteShift
     return BytesProvided - ByteShift > Index
                ? calculateSrcByte(Op->getOperand(0), StartingIndex,
-                                  Index + ByteShift)
+                                  Index + ByteShift, IsSigned)
                : ByteProvider<SDValue>::getConstantZero();
   }
 
@@ -10993,7 +11013,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     return Index < ByteShift
                ? ByteProvider<SDValue>::getConstantZero()
                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
-                                       Depth + 1, StartingIndex);
+                                       Depth + 1, StartingIndex, IsSigned);
   }
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
@@ -11013,12 +11033,21 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
       return std::nullopt;
     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
 
+    IsSigned =
+        Op->getOpcode() != ISD::ANY_EXTEND
+            ? std::optional<bool>(IsSigned.value_or(false) ||
+                                  Op->getOpcode() == ISD::SIGN_EXTEND ||
+                                  Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+                                  Op->getOpcode() == ISD::AssertSext)
+            : IsSigned;
+
     if (Index >= NarrowByteWidth)
       return Op.getOpcode() == ISD::ZERO_EXTEND
                  ? std::optional<ByteProvider<SDValue>>(
                        ByteProvider<SDValue>::getConstantZero())
                  : std::nullopt;
-    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
+    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex,
+                                 IsSigned);
   }
 
   case ISD::TRUNCATE: {
@@ -11026,7 +11055,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 
     if (NarrowByteWidth >= Index) {
       return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
-                                   StartingIndex);
+                                   StartingIndex, IsSigned);
     }
 
     return std::nullopt;
@@ -11034,13 +11063,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 
   case ISD::CopyFromReg: {
     if (BitWidth / 8 > Index)
-      return calculateSrcByte(Op, StartingIndex, Index);
+      return calculateSrcByte(Op, StartingIndex, Index, IsSigned);
 
     return std::nullopt;
   }
 
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
+
+    // Only set IsSigned if the load is extended.
+    if (L->getExtensionType() != ISD::NON_EXTLOAD)
+      IsSigned =
+          IsSigned.value_or(false) || L->getExtensionType() == ISD::SEXTLOAD;
     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
     if (NarrowBitWidth % 8 != 0)
       return std::nullopt;
@@ -11057,7 +11091,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     }
 
     if (NarrowByteWidth > Index) {
-      return calculateSrcByte(Op, StartingIndex, Index);
+      return calculateSrcByte(Op, StartingIndex, Index, IsSigned);
     }
 
     return std::nullopt;
@@ -11065,7 +11099,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
 
   case ISD::BSWAP:
     return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
-                                 Depth + 1, StartingIndex);
+                                 Depth + 1, StartingIndex, IsSigned);
 
   case ISD::EXTRACT_VECTOR_ELT: {
     auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -11080,7 +11114,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     }
 
     return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
-                            StartingIndex, Index);
+                            StartingIndex, Index, IsSigned);
   }
 
   case AMDGPUISD::PERM: {
@@ -11096,9 +11130,10 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
     auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
     auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
 
-    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
-                           : ByteProvider<SDValue>(
-                                 ByteProvider<SDValue>::getConstantZero());
+    return IdxMask != 0x0c
+               ? calculateSrcByte(NextOp, StartingIndex, NextIndex, IsSigned)
+               : ByteProvider<SDValue>(
+                     ByteProvider<SDValue>::getConstantZero());
   }
 
   default: {
@@ -12712,6 +12747,221 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   return Accum;
 }
 
+// Collect the ultimate src of each of the mul node's operands, and confirm
+// each operand is 8 bytes.
+static std::optional<ByteProvider<SDValue>>
+handleMulOperand(const SDValue &MulOperand) {
+  auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
+  if (!Byte0 || Byte0->isConstantZero()) {
+    return std::nullopt;
+  }
+  auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
+  if (Byte1 && !Byte1->isConstantZero()) {
+    return std::nullopt;
+  }
+  return Byte0;
+}
+
+static unsigned addPermMasks(unsigned First, unsigned Second) {
+  unsigned FirstCs = First & 0x0c0c0c0c;
+  unsigned SecondCs = Second & 0x0c0c0c0c;
+  unsigned FirstNoCs = First & ~0x0c0c0c0c;
+  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
+
+  assert(FirstCs & 0xFF | SecondCs & 0xFF);
+  assert(FirstCs & 0xFF00 | SecondCs & 0xFF00);
+  assert(FirstCs & 0xFF0000 | SecondCs & 0xFF0000);
+  assert(FirstCs & 0xFF000000 | SecondCs & 0xFF000000);
+
+  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
+}
+
+static void placeSources(ByteProvider<SDValue> &Src0,
+                         ByteProvider<SDValue> &Src1,
+                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
+                         SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
+                         int Step) {
+
+  assert(Src0.Src.has_value() && Src1.Src.has_value());
+  // Src0s and Src1s are empty, just place arbitrarily.
+  if (Step == 0) {
+    Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
+    Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
+    return;
+  }
+
+  for (int BPI = 0; BPI < 2; BPI++) {
+    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
+    if (BPI == 1) {
+      BPP = {Src1, Src0};
+    }
+    unsigned ZeroMask = 0x0c0c0c0c;
+    unsigned FMask = 0xFF << (8 * (3 - Step));
+
+    unsigned FirstMask =
+        BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+    unsigned SecondMask =
+        BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+    // Attempt to find Src vector which contains our SDValue, if so, add our
+    // perm mask to the existing one. If we are unable to find a match for the
+    // first SDValue, attempt to find match for the second.
+    int FirstGroup = -1;
+    for (int I = 0; I < 2; I++) {
+      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+          I == 0 ? Src0s : Src1s;
+      auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+        return IterElt.first == *BPP.first.Src;
+      };
+
+      auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesFirst);
+      if (Match != Srcs.end()) {
+        Match->second = addPermMasks(FirstMask, Match->second);
+        FirstGroup = I;
+        break;
+      }
+    }
+    if (FirstGroup != -1) {
+      SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+          FirstGroup == 1 ? Src0s : Src1s;
+      auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+        return IterElt.first == *BPP.second.Src;
+      };
+      auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesSecond);
+      if (Match != Srcs.end()) {
+        Match->second = addPermMasks(SecondMask, Match->second);
+      } else
+        Srcs.push_back({*BPP.second.Src, SecondMask});
+      return;
+    }
+  }
+
+  // If we have made it here, then we could not find a match in Src0s or Src1s
+  // for either Src0 or Src1, so just place them arbitrarily.
+
+  unsigned ZeroMask = 0x0c0c0c0c;
+  unsigned FMask = 0xFF << (8 * (3 - Step));
+
+  Src0s.push_back(
+      {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+  Src1s.push_back(
+      {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+
+  return;
+}
+
+static SDValue
+resolveSources(SelectionDAG &DAG, SDLoc SL,
+               SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+               bool IsSigned, bool IsAny) {
+
+  // If we just have one source, just permute it accordingly.
+  if (Srcs.size() == 1) {
+    auto Elt = Srcs.begin();
+    auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
+
+    // v_perm will produce the original value.
+    if (Elt->second == 0x3020100)
+      return EltVal;
+
+    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+                       DAG.getConstant(Elt->second, SL, MVT::i32));
+  }
+
+  auto FirstElt = Srcs.begin();
+  auto SecondElt = std::next(FirstElt);
+
+  SmallVector<SDValue, 2> Perms;
+
+  // If we have multiple sources in the chain, combine them via perms (using
+  // calculated perm mask) and Ors.
+  while (true) {
+    auto FirstMask = FirstElt->second;
+    auto SecondMask = SecondElt->second;
+
+    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
+    unsigned FirstPlusFour = FirstMask | 0x04040404;
+    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
+    // original 0x0C.
+    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
+
+    auto PermMask = addPermMasks(FirstMask, SecondMask);
+    auto FirstVal =
+        DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+    auto SecondVal =
+        DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
+
+    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
+                                SecondVal,
+                                DAG.getConstant(PermMask, SL, MVT::i32)));
+
+    FirstElt = std::next(SecondElt);
+    if (FirstElt == Srcs.end())
+      break;
+
+    SecondElt = std::next(FirstElt);
+    // If we only have a FirstElt, then just combine that into the cumulative
+    // source node.
+    if (SecondElt == Srcs.end()) {
+      auto EltVal =
+          DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+
+      Perms.push_back(
+          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+                      DAG.getConstant(FirstElt->second, SL, MVT::i32)));
+      break;
+    }
+  }
+
+  assert(Perms.size() == 1 || Perms.size() == 2);
+  return Perms.size() == 2
+             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
+             : Perms[0];
+}
+
+static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+                     unsigned ChainLength) {
+  for (auto &[EntryVal, EntryMask] : Srcs) {
+    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
+    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
+    EntryMask += ZeroMask;
+  }
+}
+
+static bool isMul(const SDValue Op) {
+  auto Opcode = Op.getOpcode();
+
+  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
+          Opcode == AMDGPUISD::MUL_I24);
+}
+
+static std::optional<bool> checkSignedness(const SDValue &N,
+                                           ByteProvider<SDValue> &Src0,
+                                           ByteProvider<SDValue> &Src1) {
+  auto MulOpcode = N.getOpcode();
+  std::optional<bool> IterIsSigned;
+  // Both sides of the tree must have the same signedness semantics.
+  if ((Src0.IsSigned != Src1.IsSigned) ||
+      (Src0.IsSigned.value_or(false) != Src1.IsSigned.value_or(false)))
+    return IterIsSigned;
+  // If we have a MUL_U24 op with signed semantics, then fail.
+  if (Src0.IsSigned.value_or(false) && MulOpcode == AMDGPUISD::MUL_U24)
+    return IterIsSigned;
+  // If we have a MUL_I24 op with unsigned semantics, then fail.
+  if (!Src0.IsSigned.value_or(true) && MulOpcode == AMDGPUISD::MUL_I24)
+    return IterIsSigned;
+
+  bool TopLevelSignedness =
+      MulOpcode == AMDGPUISD::MUL_I24 ||
+      (MulOpcode == ISD::MUL && N.getNode()->getFlags().hasNoSignedWrap() &&
+       !N.getNode()->getFlags().hasNoUnsignedWrap());
+
+  // In cases where we are accumulating into an i8 (for v_dot4), the
+  // ByteProvider will not have signedness info since the MSBs are dont-cares.
+  // In this case, we simply use the TopLevelSignedness of the instruction.
+  IterIsSigned = Src0.IsSigned.value_or(TopLevelSignedness);
+  return IterIsSigned;
+}
+
 SDValue SITargetLowering::performAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -12725,14 +12975,142 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
       if (SDValue Folded = tryFoldToMad64_32(N, DCI))
         return Folded;
     }
-
-    return SDValue();
   }
 
   if (SDValue V = reassociateScalarOps(N, DAG)) {
     return V;
   }
 
+  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
+      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
+    SDValue TempNode(N, 0);
+    std::optional<bool> IsSigned;
+    SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
+    SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
+    SmallVector<SDValue, 4> Src2s;
+
+    // Match the v_dot4 tree, while collecting src nodes.
+    int ChainLength = 0;
+    for (int I = 0; I < 4; I++) {
+      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
+      if (MulIdx == -1)
+        break;
+      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
+      if (!Src0)
+        break;
+      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
+      if (!Src1)
+        break;
+
+      auto IterIsSigned =
+          checkSignedness(TempNode->getOperand(MulIdx), *Src0, *Src1);
+      if (!IterIsSigned)
+        break;
+      if (!IsSigned)
+        IsSigned = *IterIsSigned;
+      if (*IterIsSigned != *IsSigned)
+        break;
+      placeSources(*Src0, *Src1, Src0s, Src1s, I);
+      auto AddIdx = 1 - MulIdx;
+      // Allow the special case where add (add (mul24, 0), mul24) became ->
+      // add (mul24, mul24).
+      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
+        Src2s.push_back(TempNode->getOperand(AddIdx));
+        auto Src0 =
+            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
+        if (!Src0)
+          break;
+        auto Src1 =
+            handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
+        if (!Src1)
+          break;
+        auto IterIsSigned =
+            checkSignedness(TempNode->getOperand(AddIdx), *Src0, *Src1);
+        if (!IterIsSigned)
+          break;
+        assert(IsSigned);
+        if (*IterIsSigned != *IsSigned)
+          break;
+        placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
+        Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
+        ChainLength = I + 2;
+        break;
+      }
+
+      TempNode = TempNode->getOperand(AddIdx);
+      Src2s.push_back(TempNode);
+      ChainLength = I + 1;
+      if (TempNode->getNumOperands() < 2)
+        break;
+      LHS = TempNode->getOperand(0);
+      RHS = TempNode->getOperand(1);
+    }
+
+    if (ChainLength < 2)
+      return SDValue();
+
+    // Masks were constructed with assumption that we would find a chain of
+    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
+    // 0x0c) so they do not affect dot calculation.
+    if (ChainLength < 4) {
+      fixMasks(Src0s, ChainLength);
+      fixMasks(Src1s, ChainLength);
+    }
+
+    SDValue Src0, Src1;
+
+    // If we are just using a single source for both, and have permuted the
+    // bytes consistently, we can just use the sources without permuting
+    // (commutation).
+    bool UseOriginalSrc = false;
+    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
+        Src0s.begin()->second == Src1s.begin()->second &&
+        Src0s.begin()->first.getValueSizeInBits() == 32 &&
+        Src1s.begin()->first.getValueSizeInBits() == 32) {
+      SmallVector<unsigned, 4> SrcBytes;
+      auto Src0Mask = Src0s.begin()->second;
+      SrcBytes.push_back(Src0Mask & 0xFF000000);
+      bool UniqueEntries = true;
+      for (auto I = 1; I < 4; I++) {
+        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
+
+        if (is_contained(SrcBytes, NextByte)) {
+          UniqueEntries = false;
+          break;
+        }
+        SrcBytes.push_back(NextByte);
+      }
+
+      if (UniqueEntries) {
+        UseOriginalSrc = true;
+        // Must be 32 bits to enter above conditional.
+        assert(Src0s.begin()->first.getValueSizeInBits() == 32);
+        assert(Src1s.begin()->first.getValueSizeInBits() == 32);
+        Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
+        Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
+      }
+    }
+
+    if (!UseOriginalSrc) {
+      Src0 = resolveSources(DAG, SL, Src0s, false, true);
+      Src1 = resolveSources(DAG, SL, Src1s, false, true);
+    }
+
+    assert(IsSigned);
+    SDValue Src2 =
+        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
+
+    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
+                                                  : Intrinsic::amdgcn_udot4,
+                                        SL, MVT::i64);
+
+    assert(!VT.isVector());
+    auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
+                           Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
+
+    return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
+  }
+
   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
     return SDValue();
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2799a3e78b04d2..792f4695d288b5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3250,9 +3250,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
-    // We should only expect these to be on src0 due to canonicalization.
-    if (Src0->isReg() && Src0->getReg() == Reg) {
-      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+    if ((Src0->isReg() && Src0->getReg() == Reg) ||
+        (Src1->isReg() && Src1->getReg() == Reg)) {
+      MachineOperand *RegSrc =
+          Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
+      if (!RegSrc->isReg() ||
+          RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())))
         return false;
 
       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
@@ -3266,18 +3269,22 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
-      // We need to swap operands 0 and 1 since madmk constant is at operand 1.
+      // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+      // would also require restricting their register classes. For now
+      // just bail out.
+      if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
+        return false;
 
       const int64_t Imm = ImmOp->getImm();
 
       // FIXME: This would be a lot easier if we could return a new instruction
       // instead of having to modify in place.
 
-      Register Src1Reg = Src1->getReg();
-      unsigned Src1SubReg = Src1->getSubReg();
-      Src0->setReg(Src1Reg);
-      Src0->setSubReg(Src1SubReg);
-      Src0->setIsKill(Src1->isKill());
+      Register SrcReg = RegSrc->getReg();
+      unsigned SrcSubReg = RegSrc->getSubReg();
+      Src0->setReg(SrcReg);
+      Src0->setSubReg(SrcSubReg);
+      Src0->setIsKill(RegSrc->isKill());
 
       if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
           Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
@@ -3346,6 +3353,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
+      // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+      // would also require restricting their register classes. For now
+      // just bail out.
+      if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
+        return false;
+
       const int64_t Imm = ImmOp->getImm();
 
       // FIXME: This would be a lot easier if we could return a new instruction
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e065b104db4eb3..d0a81673d6528c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2432,10 +2432,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (Offset == 0) {
           unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                                : AMDGPU::V_LSHRREV_B32_e64;
-          // XXX - This never happens because of emergency scavenging slot at 0?
-          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg)
-                           .addImm(ST.getWavefrontSizeLog2())
-                           .addReg(FrameReg);
+          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+          if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
+            // For V_LSHRREV, the operands are reversed (the shift count goes
+            // first).
+            Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
+          else
+            Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
           if (IsSALU && !LiveSCC)
             Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
           if (IsSALU && LiveSCC) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 52ce370bbacc64..8cca167e32ee79 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -8375,7 +8375,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   unsigned EltSize = VT.getScalarSizeInBits();
   if (EltSize >= 32 ||
       ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
-      ShuffleVectorInst::isIdentityMask(M) ||
+      ShuffleVectorInst::isIdentityMask(M, M.size()) ||
       isVREVMask(M, VT, 64) ||
       isVREVMask(M, VT, 32) ||
       isVREVMask(M, VT, 16))
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index b9d27c78ce8e41..2640ad9e362673 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -43,7 +43,6 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MI,
                                const DebugLoc &DL, MCRegister DestReg,
                                MCRegister SrcReg, bool KillSrc) const {
-  const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
   const AVRRegisterInfo &TRI = *STI.getRegisterInfo();
   unsigned Opc;
 
@@ -496,9 +495,7 @@ unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     const MachineFunction &MF = *MI.getParent()->getParent();
     const AVRTargetMachine &TM =
         static_cast<const AVRTargetMachine &>(MF.getTarget());
-    const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
     const TargetInstrInfo &TII = *STI.getInstrInfo();
-
     return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
                                   *TM.getMCAsmInfo());
   }
@@ -542,7 +539,7 @@ bool AVRInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
     llvm_unreachable("unexpected opcode!");
   case AVR::JMPk:
   case AVR::CALLk:
-    return true;
+    return STI.hasJMPCALL();
   case AVR::RCALLk:
   case AVR::RJMPk:
     return isIntN(13, BrOffset);
@@ -573,7 +570,10 @@ void AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   if (STI.hasJMPCALL())
     BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB);
   else
-    report_fatal_error("cannot create long jump without FeatureJMPCALL");
+    // The RJMP may jump to a far place beyond its legal range. We let the
+    // linker to report 'out of range' rather than crash, or silently emit
+    // incorrect assembly code.
+    BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(&NewDestBB);
 }
 
 } // end of namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 224c7b281ac446..6501c17dd810e6 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -220,7 +220,7 @@ BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
 }
 
 // 1.7.3 SC-Form
-class SCForm<bits<6> opcode, bits<1> xo,
+class SCForm<bits<6> opcode, bits<1> xo1, bits<1> xo2,
                      dag OOL, dag IOL, string asmstr, InstrItinClass itin,
                      list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
@@ -229,7 +229,8 @@ class SCForm<bits<6> opcode, bits<1> xo,
   let Pattern = pattern;
 
   let Inst{20-26} = LEV;
-  let Inst{30}    = xo;
+  let Inst{30}    = xo1;
+  let Inst{31}    = xo2;
 }
 
 // 1.7.4 D-Form
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 0567665fbc7349..a97062e0c643fb 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1641,10 +1641,18 @@ let isBranch = 1, isTerminator = 1, Size = 0 in {
 
 // System call.
 let PPC970_Unit = 7 in {
-  def SC     : SCForm<17, 1, (outs), (ins i32imm:$LEV),
+  def SC     : SCForm<17, 1, 0, (outs), (ins i32imm:$LEV),
                       "sc $LEV", IIC_BrB, [(PPCsc (i32 imm:$LEV))]>;
 }
 
+// We mark SCV as having no scheduling model since it is only meant to be used
+// as inline assembly. If we implement a builtin pattern for it we will need to
+// add it to the P9 and P10 scheduling models.
+let Predicates = [IsISA3_0], hasNoSchedulingInfo = 1 in {
+  def SCV : SCForm<17, 0, 1, (outs), (ins i32imm:$LEV),
+                   "scv $LEV", IIC_BrB, []>;
+}
+
 // Branch history rolling buffer.
 def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB,
                       [(PPCclrbhrb)]>,
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 76a73436d61b54..d7a51891087e41 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -113,7 +113,7 @@ struct PPCMIPeephole : public MachineFunctionPass {
   MachineDominatorTree *MDT;
   MachinePostDominatorTree *MPDT;
   MachineBlockFrequencyInfo *MBFI;
-  uint64_t EntryFreq;
+  BlockFrequency EntryFreq;
   SmallSet<Register, 16> RegsToUpdate;
 
   // Initialize class variables.
@@ -300,7 +300,7 @@ void PPCMIPeephole::UpdateTOCSaves(
     PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
 
     MachineBasicBlock *Entry = &MF->front();
-    uint64_t CurrBlockFreq = MBFI->getBlockFreq(MI->getParent()).getFrequency();
+    BlockFrequency CurrBlockFreq = MBFI->getBlockFreq(MI->getParent());
 
     // If the block in which the TOC save resides is in a block that
     // post-dominates Entry, or a block that is hotter than entry (keep in mind
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index f12e868e74264f..4479bccfd45e39 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -169,6 +169,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) {
 
   getActionDefinitionsBuilder(G_ABS).lower();
 
+  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+
   getLegacyLegalizerInfo().computeTables();
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index 0a42c6faee2900..d71efc11e6a9fc 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -273,7 +273,7 @@ int RISCVLoadFPImm::getLoadFPImm(APFloat FPImm) {
   if (Sign) {
     if (Entry == 16)
       return 0;
-    return false;
+    return -1;
   }
 
   return Entry;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index dc00fdfc2146a8..403bd727684f68 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -438,6 +438,39 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
   return Res;
 }
 
+InstSeq generateTwoRegInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures,
+                              unsigned &ShiftAmt, unsigned &AddOpc) {
+  int64_t LoVal = SignExtend64<32>(Val);
+  if (LoVal == 0)
+    return RISCVMatInt::InstSeq();
+
+  // Subtract the LoVal to emulate the effect of the final ADD.
+  uint64_t Tmp = (uint64_t)Val - (uint64_t)LoVal;
+  assert(Tmp != 0);
+
+  // Use trailing zero counts to figure how far we need to shift LoVal to line
+  // up with the remaining constant.
+  // TODO: This algorithm assumes all non-zero bits in the low 32 bits of the
+  // final constant come from LoVal.
+  unsigned TzLo = llvm::countr_zero((uint64_t)LoVal);
+  unsigned TzHi = llvm::countr_zero(Tmp);
+  assert(TzLo < 32 && TzHi >= 32);
+  ShiftAmt = TzHi - TzLo;
+  AddOpc = RISCV::ADD;
+
+  if (Tmp == ((uint64_t)LoVal << ShiftAmt))
+    return RISCVMatInt::generateInstSeq(LoVal, ActiveFeatures);
+
+  // If we have Zba, we can use (ADD_UW X, (SLLI X, 32)).
+  if (ActiveFeatures[RISCV::FeatureStdExtZba] && Lo_32(Val) == Hi_32(Val)) {
+    ShiftAmt = 32;
+    AddOpc = RISCV::ADD_UW;
+    return RISCVMatInt::generateInstSeq(LoVal, ActiveFeatures);
+  }
+
+  return RISCVMatInt::InstSeq();
+}
+
 int getIntMatCost(const APInt &Val, unsigned Size,
                   const FeatureBitset &ActiveFeatures, bool CompressionCost) {
   bool IsRV64 = ActiveFeatures[RISCV::Feature64Bit];
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
index ae7b8d402184d9..072b30f2a06484 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
@@ -48,6 +48,14 @@ using InstSeq = SmallVector<Inst, 8>;
 // instruction selection.
 InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures);
 
+// Helper to generate an instruction sequence that can materialize the given
+// immediate value into a register using an additional temporary register. This
+// handles cases where the constant can be generated by (ADD (SLLI X, C), X) or
+// (ADD_UW (SLLI X, C) X). The sequence to generate X is returned. ShiftAmt is
+// provides the SLLI and AddOpc indicates ADD or ADD_UW.
+InstSeq generateTwoRegInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures,
+                              unsigned &ShiftAmt, unsigned &AddOpc);
+
 // Helper to estimate the number of instructions required to materialise the
 // given immediate value into a register. This estimate does not account for
 // `Val` possibly fitting into an immediate, and so may over-estimate.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3f099198f2a5f9..3d3486b7fa8956 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -159,7 +159,7 @@ def HasStdExtZhinxOrZhinxmin
                                    "'Zhinxmin' (Half Float in Integer Minimal)">;
 
 def FeatureStdExtZfa
-    : SubtargetFeature<"experimental-zfa", "HasStdExtZfa", "true",
+    : SubtargetFeature<"zfa", "HasStdExtZfa", "true",
                        "'Zfa' (Additional Floating-Point)",
                        [FeatureStdExtF]>;
 def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">,
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index e90b8d49a09684..9bf1e12584aee3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -195,29 +195,23 @@ static SDValue selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT,
   RISCVMatInt::InstSeq Seq =
       RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits());
 
-  // See if we can create this constant as (ADD (SLLI X, 32), X) where X is at
+  // See if we can create this constant as (ADD (SLLI X, C), X) where X is at
   // worst an LUI+ADDIW. This will require an extra register, but avoids a
   // constant pool.
   // If we have Zba we can use (ADD_UW X, (SLLI X, 32)) to handle cases where
   // low and high 32 bits are the same and bit 31 and 63 are set.
   if (Seq.size() > 3) {
-    int64_t LoVal = SignExtend64<32>(Imm);
-    int64_t HiVal = SignExtend64<32>(((uint64_t)Imm - (uint64_t)LoVal) >> 32);
-    if (LoVal == HiVal ||
-        (Subtarget.hasStdExtZba() && Lo_32(Imm) == Hi_32(Imm))) {
-      RISCVMatInt::InstSeq SeqLo =
-          RISCVMatInt::generateInstSeq(LoVal, Subtarget.getFeatureBits());
-      if ((SeqLo.size() + 2) < Seq.size()) {
-        SDValue Lo = selectImmSeq(CurDAG, DL, VT, SeqLo);
-
-        SDValue SLLI = SDValue(
-            CurDAG->getMachineNode(RISCV::SLLI, DL, VT, Lo,
-                                   CurDAG->getTargetConstant(32, DL, VT)),
-            0);
-        // Prefer ADD when possible.
-        unsigned AddOpc = (LoVal == HiVal) ? RISCV::ADD : RISCV::ADD_UW;
-        return SDValue(CurDAG->getMachineNode(AddOpc, DL, VT, Lo, SLLI), 0);
-      }
+    unsigned ShiftAmt, AddOpc;
+    RISCVMatInt::InstSeq SeqLo = RISCVMatInt::generateTwoRegInstSeq(
+        Imm, Subtarget.getFeatureBits(), ShiftAmt, AddOpc);
+    if (!SeqLo.empty() && (SeqLo.size() + 2) < Seq.size()) {
+      SDValue Lo = selectImmSeq(CurDAG, DL, VT, SeqLo);
+
+      SDValue SLLI = SDValue(
+          CurDAG->getMachineNode(RISCV::SLLI, DL, VT, Lo,
+                                 CurDAG->getTargetConstant(ShiftAmt, DL, VT)),
+          0);
+      return SDValue(CurDAG->getMachineNode(AddOpc, DL, VT, Lo, SLLI), 0);
     }
   }
 
@@ -2785,120 +2779,9 @@ static bool vectorPseudoHasAllNBitUsers(SDNode *User, unsigned UserOpNo,
   if (UserOpNo == VLIdx)
     return false;
 
-  // TODO: Handle Zvbb instructions
-  switch (PseudoInfo->BaseInstr) {
-  default:
-    return false;
-
-  // 11.6. Vector Single-Width Shift Instructions
-  case RISCV::VSLL_VX:
-  case RISCV::VSRL_VX:
-  case RISCV::VSRA_VX:
-  // 12.4. Vector Single-Width Scaling Shift Instructions
-  case RISCV::VSSRL_VX:
-  case RISCV::VSSRA_VX:
-    // Only the low lg2(SEW) bits of the shift-amount value are used.
-    if (Bits < Log2SEW)
-      return false;
-    break;
-
-  // 11.7 Vector Narrowing Integer Right Shift Instructions
-  case RISCV::VNSRL_WX:
-  case RISCV::VNSRA_WX:
-  // 12.5. Vector Narrowing Fixed-Point Clip Instructions
-  case RISCV::VNCLIPU_WX:
-  case RISCV::VNCLIP_WX:
-    // Only the low lg2(2*SEW) bits of the shift-amount value are used.
-    if (Bits < Log2SEW + 1)
-      return false;
-    break;
-
-  // 11.1. Vector Single-Width Integer Add and Subtract
-  case RISCV::VADD_VX:
-  case RISCV::VSUB_VX:
-  case RISCV::VRSUB_VX:
-  // 11.2. Vector Widening Integer Add/Subtract
-  case RISCV::VWADDU_VX:
-  case RISCV::VWSUBU_VX:
-  case RISCV::VWADD_VX:
-  case RISCV::VWSUB_VX:
-  case RISCV::VWADDU_WX:
-  case RISCV::VWSUBU_WX:
-  case RISCV::VWADD_WX:
-  case RISCV::VWSUB_WX:
-  // 11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  case RISCV::VADC_VXM:
-  case RISCV::VADC_VIM:
-  case RISCV::VMADC_VXM:
-  case RISCV::VMADC_VIM:
-  case RISCV::VMADC_VX:
-  case RISCV::VSBC_VXM:
-  case RISCV::VMSBC_VXM:
-  case RISCV::VMSBC_VX:
-  // 11.5 Vector Bitwise Logical Instructions
-  case RISCV::VAND_VX:
-  case RISCV::VOR_VX:
-  case RISCV::VXOR_VX:
-  // 11.8. Vector Integer Compare Instructions
-  case RISCV::VMSEQ_VX:
-  case RISCV::VMSNE_VX:
-  case RISCV::VMSLTU_VX:
-  case RISCV::VMSLT_VX:
-  case RISCV::VMSLEU_VX:
-  case RISCV::VMSLE_VX:
-  case RISCV::VMSGTU_VX:
-  case RISCV::VMSGT_VX:
-  // 11.9. Vector Integer Min/Max Instructions
-  case RISCV::VMINU_VX:
-  case RISCV::VMIN_VX:
-  case RISCV::VMAXU_VX:
-  case RISCV::VMAX_VX:
-  // 11.10. Vector Single-Width Integer Multiply Instructions
-  case RISCV::VMUL_VX:
-  case RISCV::VMULH_VX:
-  case RISCV::VMULHU_VX:
-  case RISCV::VMULHSU_VX:
-  // 11.11. Vector Integer Divide Instructions
-  case RISCV::VDIVU_VX:
-  case RISCV::VDIV_VX:
-  case RISCV::VREMU_VX:
-  case RISCV::VREM_VX:
-  // 11.12. Vector Widening Integer Multiply Instructions
-  case RISCV::VWMUL_VX:
-  case RISCV::VWMULU_VX:
-  case RISCV::VWMULSU_VX:
-  // 11.13. Vector Single-Width Integer Multiply-Add Instructions
-  case RISCV::VMACC_VX:
-  case RISCV::VNMSAC_VX:
-  case RISCV::VMADD_VX:
-  case RISCV::VNMSUB_VX:
-  // 11.14. Vector Widening Integer Multiply-Add Instructions
-  case RISCV::VWMACCU_VX:
-  case RISCV::VWMACC_VX:
-  case RISCV::VWMACCSU_VX:
-  case RISCV::VWMACCUS_VX:
-  // 11.15. Vector Integer Merge Instructions
-  case RISCV::VMERGE_VXM:
-  // 11.16. Vector Integer Move Instructions
-  case RISCV::VMV_V_X:
-  // 12.1. Vector Single-Width Saturating Add and Subtract
-  case RISCV::VSADDU_VX:
-  case RISCV::VSADD_VX:
-  case RISCV::VSSUBU_VX:
-  case RISCV::VSSUB_VX:
-  // 12.2. Vector Single-Width Averaging Add and Subtract
-  case RISCV::VAADDU_VX:
-  case RISCV::VAADD_VX:
-  case RISCV::VASUBU_VX:
-  case RISCV::VASUB_VX:
-  // 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
-  case RISCV::VSMUL_VX:
-  // 16.1. Integer Scalar Move Instructions
-  case RISCV::VMV_S_X:
-    if (Bits < (1U << Log2SEW))
-      return false;
-  }
-  return true;
+  auto NumDemandedBits =
+      RISCV::getVectorLowDemandedScalarBits(PseudoInfo->BaseInstr, Log2SEW);
+  return NumDemandedBits && Bits >= *NumDemandedBits;
 }
 
 // Return true if all users of this SDNode* only consume the lower \p Bits.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5e3975df1c4425..bd4150c87eabbd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -779,15 +779,22 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       // Splice
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
 
+      if (Subtarget.hasStdExtZvkb()) {
+        setOperationAction(ISD::BSWAP, VT, Legal);
+        setOperationAction(ISD::VP_BSWAP, VT, Custom);
+      } else {
+        setOperationAction({ISD::BSWAP, ISD::VP_BSWAP}, VT, Expand);
+        setOperationAction({ISD::ROTL, ISD::ROTR}, VT, Expand);
+      }
+
       if (Subtarget.hasStdExtZvbb()) {
-        setOperationAction({ISD::BITREVERSE, ISD::BSWAP}, VT, Legal);
-        setOperationAction({ISD::VP_BITREVERSE, ISD::VP_BSWAP}, VT, Custom);
+        setOperationAction(ISD::BITREVERSE, VT, Legal);
+        setOperationAction(ISD::VP_BITREVERSE, VT, Custom);
         setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
                             ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
                            VT, Custom);
       } else {
-        setOperationAction({ISD::BITREVERSE, ISD::BSWAP}, VT, Expand);
-        setOperationAction({ISD::VP_BITREVERSE, ISD::VP_BSWAP}, VT, Expand);
+        setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
         setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand);
         setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
                             ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
@@ -802,8 +809,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                               ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ_ZERO_UNDEF},
                              VT, Custom);
         }
-
-        setOperationAction({ISD::ROTL, ISD::ROTR}, VT, Expand);
       }
     }
 
@@ -1109,11 +1114,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
         setOperationAction(IntegerVPOps, VT, Custom);
 
+        if (Subtarget.hasStdExtZvkb())
+          setOperationAction({ISD::BSWAP, ISD::ROTL, ISD::ROTR}, VT, Custom);
+
         if (Subtarget.hasStdExtZvbb()) {
-          setOperationAction({ISD::BITREVERSE, ISD::BSWAP, ISD::CTLZ,
-                              ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ,
-                              ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP, ISD::ROTL,
-                              ISD::ROTR},
+          setOperationAction({ISD::BITREVERSE, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
+                              ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP},
                              VT, Custom);
         } else {
           // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
@@ -2013,14 +2019,17 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
     // -0.0 can be created by fmv + fneg.
     return Imm.isZero();
   }
-  // Special case: the cost for -0.0 is 1.
-  int Cost = Imm.isNegZero()
-                 ? 1
-                 : RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(),
-                                              Subtarget.getXLen(),
-                                              Subtarget.getFeatureBits());
-  // If the constantpool data is already in cache, only Cost 1 is cheaper.
-  return Cost < FPImmCost;
+
+  // Special case: fmv + fneg
+  if (Imm.isNegZero())
+    return true;
+
+  // Building an integer and then converting requires a fmv at the end of
+  // the integer sequence.
+  const int Cost =
+    1 + RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(), Subtarget.getXLen(),
+                                   Subtarget.getFeatureBits());
+  return Cost <= FPImmCost;
 }
 
 // TODO: This is very conservative.
@@ -2122,7 +2131,7 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
       }
       break;
     case ISD::SETLT:
-      // Convert X < 1 to 0 <= X.
+      // Convert X < 1 to 0 >= X.
       if (C == 1) {
         RHS = LHS;
         LHS = DAG.getConstant(0, DL, RHS.getValueType());
@@ -3386,18 +3395,16 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
       VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
       if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
           (StepOpcode == ISD::SHL && SplatStepVal != 0)) {
-        SDValue SplatStep = DAG.getSplatBuildVector(
-            VIDVT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT));
+        SDValue SplatStep = DAG.getConstant(SplatStepVal, DL, VIDVT);
         VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
       }
       if (StepDenominator != 1) {
-        SDValue SplatStep = DAG.getSplatBuildVector(
-            VIDVT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT));
+        SDValue SplatStep =
+            DAG.getConstant(Log2_64(StepDenominator), DL, VIDVT);
         VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
       }
       if (Addend != 0 || Negate) {
-        SDValue SplatAddend = DAG.getSplatBuildVector(
-            VIDVT, DL, DAG.getConstant(Addend, DL, XLenVT));
+        SDValue SplatAddend = DAG.getConstant(Addend, DL, VIDVT);
         VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
                           VID);
       }
@@ -3530,8 +3537,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
       (NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
     unsigned SignBits = DAG.ComputeNumSignBits(Op);
     if (EltBitSize - SignBits < 8) {
-      SDValue Source =
-        DAG.getNode(ISD::TRUNCATE, DL, VT.changeVectorElementType(MVT::i8), Op);
+      SDValue Source = DAG.getBuildVector(VT.changeVectorElementType(MVT::i8),
+                                          DL, Op->ops());
       Source = convertToScalableVector(ContainerVT.changeVectorElementType(MVT::i8),
                                        Source, DAG, Subtarget);
       SDValue Res = DAG.getNode(RISCVISD::VSEXT_VL, DL, ContainerVT, Source, Mask, VL);
@@ -4333,7 +4340,8 @@ static SDValue lowerBitreverseShuffle(ShuffleVectorSDNode *SVN,
 
   assert(VT.getVectorElementType() == MVT::i1);
 
-  if (!ShuffleVectorInst::isReverseMask(SVN->getMask()) ||
+  if (!ShuffleVectorInst::isReverseMask(SVN->getMask(),
+                                        SVN->getMask().size()) ||
       !SVN->getOperand(1).isUndef())
     return SDValue();
 
@@ -4376,7 +4384,7 @@ static SDValue lowerBitreverseShuffle(ShuffleVectorSDNode *SVN,
 
 // Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can
 // reinterpret it as a v2i32 and rotate it right by 8 instead. We can lower this
-// as a vror.vi if we have zvbb, or otherwise as a vsll, vsrl and vor.
+// as a vror.vi if we have Zvkb, or otherwise as a vsll, vsrl and vor.
 static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
                                            SelectionDAG &DAG,
                                            const RISCVSubtarget &Subtarget) {
@@ -4527,9 +4535,9 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
           lowerVECTOR_SHUFFLEAsVSlidedown(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return V;
 
-  // A bitrotate will be one instruction on zvbb, so try to lower to it first if
+  // A bitrotate will be one instruction on Zvkb, so try to lower to it first if
   // available.
-  if (Subtarget.hasStdExtZvbb())
+  if (Subtarget.hasStdExtZvkb())
     if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
       return V;
 
@@ -4660,7 +4668,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
 
   // We might be able to express the shuffle as a bitrotate. But even if we
-  // don't have zvbb and have to expand, the expanded sequence of approx. 2
+  // don't have Zvkb and have to expand, the expanded sequence of approx. 2
   // shifts and a vor will have a higher throughput than a vrgather.
   if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
     return V;
@@ -4972,24 +4980,22 @@ static SDValue lowerConstant(SDValue Op, SelectionDAG &DAG,
   if (Seq.size() <= Subtarget.getMaxBuildIntsCost())
     return Op;
 
-  // Special case. See if we can build the constant as (ADD (SLLI X, 32), X) do
+  // Optimizations below are disabled for opt size. If we're optimizing for
+  // size, use a constant pool.
+  if (DAG.shouldOptForSize())
+    return SDValue();
+
+  // Special case. See if we can build the constant as (ADD (SLLI X, C), X) do
   // that if it will avoid a constant pool.
   // It will require an extra temporary register though.
   // If we have Zba we can use (ADD_UW X, (SLLI X, 32)) to handle cases where
   // low and high 32 bits are the same and bit 31 and 63 are set.
-  if (!DAG.shouldOptForSize()) {
-    int64_t LoVal = SignExtend64<32>(Imm);
-    int64_t HiVal = SignExtend64<32>(((uint64_t)Imm - (uint64_t)LoVal) >> 32);
-    if (LoVal == HiVal ||
-        (Subtarget.hasStdExtZba() && Lo_32(Imm) == Hi_32(Imm))) {
-      RISCVMatInt::InstSeq SeqLo =
-          RISCVMatInt::generateInstSeq(LoVal, Subtarget.getFeatureBits());
-      if ((SeqLo.size() + 2) <= Subtarget.getMaxBuildIntsCost())
-        return Op;
-    }
-  }
+  unsigned ShiftAmt, AddOpc;
+  RISCVMatInt::InstSeq SeqLo = RISCVMatInt::generateTwoRegInstSeq(
+      Imm, Subtarget.getFeatureBits(), ShiftAmt, AddOpc);
+  if (!SeqLo.empty() && (SeqLo.size() + 2) <= Subtarget.getMaxBuildIntsCost())
+    return Op;
 
-  // Expand to a constant pool using the default expansion code.
   return SDValue();
 }
 
@@ -5484,7 +5490,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::ROTL:
   case ISD::ROTR:
     if (Op.getValueType().isFixedLengthVector()) {
-      assert(Subtarget.hasStdExtZvbb());
+      assert(Subtarget.hasStdExtZvkb());
       return lowerToScalableOp(Op, DAG);
     }
     assert(Subtarget.hasVendorXTHeadBb() &&
@@ -8335,23 +8341,40 @@ static unsigned getRVVReductionOp(unsigned ISDOpcode) {
   switch (ISDOpcode) {
   default:
     llvm_unreachable("Unhandled reduction");
+  case ISD::VP_REDUCE_ADD:
   case ISD::VECREDUCE_ADD:
     return RISCVISD::VECREDUCE_ADD_VL;
+  case ISD::VP_REDUCE_UMAX:
   case ISD::VECREDUCE_UMAX:
     return RISCVISD::VECREDUCE_UMAX_VL;
+  case ISD::VP_REDUCE_SMAX:
   case ISD::VECREDUCE_SMAX:
     return RISCVISD::VECREDUCE_SMAX_VL;
+  case ISD::VP_REDUCE_UMIN:
   case ISD::VECREDUCE_UMIN:
     return RISCVISD::VECREDUCE_UMIN_VL;
+  case ISD::VP_REDUCE_SMIN:
   case ISD::VECREDUCE_SMIN:
     return RISCVISD::VECREDUCE_SMIN_VL;
+  case ISD::VP_REDUCE_AND:
   case ISD::VECREDUCE_AND:
     return RISCVISD::VECREDUCE_AND_VL;
+  case ISD::VP_REDUCE_OR:
   case ISD::VECREDUCE_OR:
     return RISCVISD::VECREDUCE_OR_VL;
+  case ISD::VP_REDUCE_XOR:
   case ISD::VECREDUCE_XOR:
     return RISCVISD::VECREDUCE_XOR_VL;
+  case ISD::VP_REDUCE_FADD:
+    return RISCVISD::VECREDUCE_FADD_VL;
+  case ISD::VP_REDUCE_SEQ_FADD:
+    return RISCVISD::VECREDUCE_SEQ_FADD_VL;
+  case ISD::VP_REDUCE_FMAX:
+    return RISCVISD::VECREDUCE_FMAX_VL;
+  case ISD::VP_REDUCE_FMIN:
+    return RISCVISD::VECREDUCE_FMIN_VL;
   }
+
 }
 
 SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op,
@@ -8583,37 +8606,6 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
                            VectorVal, Mask, VL, DL, DAG, Subtarget);
 }
 
-static unsigned getRVVVPReductionOp(unsigned ISDOpcode) {
-  switch (ISDOpcode) {
-  default:
-    llvm_unreachable("Unhandled reduction");
-  case ISD::VP_REDUCE_ADD:
-    return RISCVISD::VECREDUCE_ADD_VL;
-  case ISD::VP_REDUCE_UMAX:
-    return RISCVISD::VECREDUCE_UMAX_VL;
-  case ISD::VP_REDUCE_SMAX:
-    return RISCVISD::VECREDUCE_SMAX_VL;
-  case ISD::VP_REDUCE_UMIN:
-    return RISCVISD::VECREDUCE_UMIN_VL;
-  case ISD::VP_REDUCE_SMIN:
-    return RISCVISD::VECREDUCE_SMIN_VL;
-  case ISD::VP_REDUCE_AND:
-    return RISCVISD::VECREDUCE_AND_VL;
-  case ISD::VP_REDUCE_OR:
-    return RISCVISD::VECREDUCE_OR_VL;
-  case ISD::VP_REDUCE_XOR:
-    return RISCVISD::VECREDUCE_XOR_VL;
-  case ISD::VP_REDUCE_FADD:
-    return RISCVISD::VECREDUCE_FADD_VL;
-  case ISD::VP_REDUCE_SEQ_FADD:
-    return RISCVISD::VECREDUCE_SEQ_FADD_VL;
-  case ISD::VP_REDUCE_FMAX:
-    return RISCVISD::VECREDUCE_FMAX_VL;
-  case ISD::VP_REDUCE_FMIN:
-    return RISCVISD::VECREDUCE_FMIN_VL;
-  }
-}
-
 SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc DL(Op);
@@ -8626,7 +8618,7 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
     return SDValue();
 
   MVT VecVT = VecEVT.getSimpleVT();
-  unsigned RVVOpcode = getRVVVPReductionOp(Op.getOpcode());
+  unsigned RVVOpcode = getRVVReductionOp(Op.getOpcode());
 
   if (VecVT.isFixedLengthVector()) {
     auto ContainerVT = getContainerForFixedLengthVector(VecVT);
@@ -11122,6 +11114,122 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
+/// Given an integer binary operator, return the generic ISD::VECREDUCE_OP
+/// which corresponds to it.
+static unsigned getVecReduceOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled binary to transfrom reduction");
+  case ISD::ADD:
+    return ISD::VECREDUCE_ADD;
+  case ISD::UMAX:
+    return ISD::VECREDUCE_UMAX;
+  case ISD::SMAX:
+    return ISD::VECREDUCE_SMAX;
+  case ISD::UMIN:
+    return ISD::VECREDUCE_UMIN;
+  case ISD::SMIN:
+    return ISD::VECREDUCE_SMIN;
+  case ISD::AND:
+    return ISD::VECREDUCE_AND;
+  case ISD::OR:
+    return ISD::VECREDUCE_OR;
+  case ISD::XOR:
+    return ISD::VECREDUCE_XOR;
+  }
+}
+
+/// Perform two related transforms whose purpose is to incrementally recognize
+/// an explode_vector followed by scalar reduction as a vector reduction node.
+/// This exists to recover from a deficiency in SLP which can't handle
+/// forests with multiple roots sharing common nodes.  In some cases, one
+/// of the trees will be vectorized, and the other will remain (unprofitably)
+/// scalarized.
+static SDValue
+combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
+                                  const RISCVSubtarget &Subtarget) {
+
+  // This transforms need to run before all integer types have been legalized
+  // to i64 (so that the vector element type matches the add type), and while
+  // it's safe to introduce odd sized vector types.
+  if (DAG.NewNodesMustHaveLegalTypes)
+    return SDValue();
+
+  // Without V, this transform isn't useful.  We could form the (illegal)
+  // operations and let them be scalarized again, but there's really no point.
+  if (!Subtarget.hasVInstructions())
+    return SDValue();
+
+  const SDLoc DL(N);
+  const EVT VT = N->getValueType(0);
+
+  // TODO: Handle floating point here.
+  if (!VT.isInteger())
+    return SDValue();
+
+  const unsigned Opc = N->getOpcode();
+  const unsigned ReduceOpc = getVecReduceOpcode(Opc);
+  assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) &&
+         "Inconsistent mappings");
+  const SDValue LHS = N->getOperand(0);
+  const SDValue RHS = N->getOperand(1);
+
+  if (!LHS.hasOneUse() || !RHS.hasOneUse())
+    return SDValue();
+
+  if (RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(RHS.getOperand(1)))
+    return SDValue();
+
+  SDValue SrcVec = RHS.getOperand(0);
+  EVT SrcVecVT = SrcVec.getValueType();
+  assert(SrcVecVT.getVectorElementType() == VT);
+  if (SrcVecVT.isScalableVector())
+    return SDValue();
+
+  if (SrcVecVT.getScalarSizeInBits() > Subtarget.getELen())
+    return SDValue();
+
+  // match binop (extract_vector_elt V, 0), (extract_vector_elt V, 1) to
+  // reduce_op (extract_subvector [2 x VT] from V).  This will form the
+  // root of our reduction tree. TODO: We could extend this to any two
+  // adjacent constant indices if desired.
+  if (LHS.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      LHS.getOperand(0) == SrcVec && isNullConstant(LHS.getOperand(1)) &&
+      isOneConstant(RHS.getOperand(1))) {
+    EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, 2);
+    SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
+                              DAG.getVectorIdxConstant(0, DL));
+    return DAG.getNode(ReduceOpc, DL, VT, Vec);
+  }
+
+  // Match (binop (reduce (extract_subvector V, 0),
+  //                      (extract_vector_elt V, sizeof(SubVec))))
+  // into a reduction of one more element from the original vector V.
+  if (LHS.getOpcode() != ReduceOpc)
+    return SDValue();
+
+  SDValue ReduceVec = LHS.getOperand(0);
+  if (ReduceVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      ReduceVec.hasOneUse() && ReduceVec.getOperand(0) == RHS.getOperand(0) &&
+      isNullConstant(ReduceVec.getOperand(1))) {
+    uint64_t Idx = cast<ConstantSDNode>(RHS.getOperand(1))->getLimitedValue();
+    if (ReduceVec.getValueType().getVectorNumElements() == Idx) {
+      // For illegal types (e.g. 3xi32), most will be combined again into a
+      // wider (hopefully legal) type.  If this is a terminal state, we are
+      // relying on type legalization here to produce something reasonable
+      // and this lowering quality could probably be improved. (TODO)
+      EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, Idx + 1);
+      SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
+                                DAG.getVectorIdxConstant(0, DL));
+      return DAG.getNode(ReduceOpc, DL, VT, Vec);
+    }
+  }
+
+  return SDValue();
+}
+
+
 // Try to fold (<bop> x, (reduction.<bop> vec, start))
 static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG,
                                     const RISCVSubtarget &Subtarget) {
@@ -11449,6 +11557,9 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
     return V;
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
+  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+    return V;
+
   // fold (add (select lhs, rhs, cc, 0, y), x) ->
   //      (select lhs, rhs, cc, x, (add x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
@@ -11619,6 +11730,8 @@ static SDValue performANDCombine(SDNode *N,
 
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
+  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+    return V;
 
   if (DCI.isAfterLegalizeDAG())
     if (SDValue V = combineDeMorganOfBoolean(N, DAG))
@@ -11671,6 +11784,8 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
 
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
+  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+    return V;
 
   if (DCI.isAfterLegalizeDAG())
     if (SDValue V = combineDeMorganOfBoolean(N, DAG))
@@ -11722,6 +11837,9 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
 
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
+  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+    return V;
+
   // fold (xor (select cond, 0, y), x) ->
   //      (select cond, x, (xor x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
@@ -13927,8 +14045,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::FMAXNUM:
-  case ISD::FMINNUM:
-    return combineBinOpToReduce(N, DAG, Subtarget);
+  case ISD::FMINNUM: {
+    if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
+      return V;
+    if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+      return V;
+    return SDValue();
+  }
   case ISD::SETCC:
     return performSETCCCombine(N, DAG, Subtarget);
   case ISD::SIGN_EXTEND_INREG:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 6ee5e2d4c58404..90f7a4f1edbe94 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2847,3 +2847,115 @@ bool RISCV::hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2) {
   MachineOperand FrmOp2 = MI2.getOperand(MI2FrmOpIdx);
   return FrmOp1.getImm() == FrmOp2.getImm();
 }
+
+std::optional<unsigned>
+RISCV::getVectorLowDemandedScalarBits(uint16_t Opcode, unsigned Log2SEW) {
+  // TODO: Handle Zvbb instructions
+  switch (Opcode) {
+  default:
+    return std::nullopt;
+
+  // 11.6. Vector Single-Width Shift Instructions
+  case RISCV::VSLL_VX:
+  case RISCV::VSRL_VX:
+  case RISCV::VSRA_VX:
+  // 12.4. Vector Single-Width Scaling Shift Instructions
+  case RISCV::VSSRL_VX:
+  case RISCV::VSSRA_VX:
+    // Only the low lg2(SEW) bits of the shift-amount value are used.
+    return Log2SEW;
+
+  // 11.7 Vector Narrowing Integer Right Shift Instructions
+  case RISCV::VNSRL_WX:
+  case RISCV::VNSRA_WX:
+  // 12.5. Vector Narrowing Fixed-Point Clip Instructions
+  case RISCV::VNCLIPU_WX:
+  case RISCV::VNCLIP_WX:
+    // Only the low lg2(2*SEW) bits of the shift-amount value are used.
+    return Log2SEW + 1;
+
+  // 11.1. Vector Single-Width Integer Add and Subtract
+  case RISCV::VADD_VX:
+  case RISCV::VSUB_VX:
+  case RISCV::VRSUB_VX:
+  // 11.2. Vector Widening Integer Add/Subtract
+  case RISCV::VWADDU_VX:
+  case RISCV::VWSUBU_VX:
+  case RISCV::VWADD_VX:
+  case RISCV::VWSUB_VX:
+  case RISCV::VWADDU_WX:
+  case RISCV::VWSUBU_WX:
+  case RISCV::VWADD_WX:
+  case RISCV::VWSUB_WX:
+  // 11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+  case RISCV::VADC_VXM:
+  case RISCV::VADC_VIM:
+  case RISCV::VMADC_VXM:
+  case RISCV::VMADC_VIM:
+  case RISCV::VMADC_VX:
+  case RISCV::VSBC_VXM:
+  case RISCV::VMSBC_VXM:
+  case RISCV::VMSBC_VX:
+  // 11.5 Vector Bitwise Logical Instructions
+  case RISCV::VAND_VX:
+  case RISCV::VOR_VX:
+  case RISCV::VXOR_VX:
+  // 11.8. Vector Integer Compare Instructions
+  case RISCV::VMSEQ_VX:
+  case RISCV::VMSNE_VX:
+  case RISCV::VMSLTU_VX:
+  case RISCV::VMSLT_VX:
+  case RISCV::VMSLEU_VX:
+  case RISCV::VMSLE_VX:
+  case RISCV::VMSGTU_VX:
+  case RISCV::VMSGT_VX:
+  // 11.9. Vector Integer Min/Max Instructions
+  case RISCV::VMINU_VX:
+  case RISCV::VMIN_VX:
+  case RISCV::VMAXU_VX:
+  case RISCV::VMAX_VX:
+  // 11.10. Vector Single-Width Integer Multiply Instructions
+  case RISCV::VMUL_VX:
+  case RISCV::VMULH_VX:
+  case RISCV::VMULHU_VX:
+  case RISCV::VMULHSU_VX:
+  // 11.11. Vector Integer Divide Instructions
+  case RISCV::VDIVU_VX:
+  case RISCV::VDIV_VX:
+  case RISCV::VREMU_VX:
+  case RISCV::VREM_VX:
+  // 11.12. Vector Widening Integer Multiply Instructions
+  case RISCV::VWMUL_VX:
+  case RISCV::VWMULU_VX:
+  case RISCV::VWMULSU_VX:
+  // 11.13. Vector Single-Width Integer Multiply-Add Instructions
+  case RISCV::VMACC_VX:
+  case RISCV::VNMSAC_VX:
+  case RISCV::VMADD_VX:
+  case RISCV::VNMSUB_VX:
+  // 11.14. Vector Widening Integer Multiply-Add Instructions
+  case RISCV::VWMACCU_VX:
+  case RISCV::VWMACC_VX:
+  case RISCV::VWMACCSU_VX:
+  case RISCV::VWMACCUS_VX:
+  // 11.15. Vector Integer Merge Instructions
+  case RISCV::VMERGE_VXM:
+  // 11.16. Vector Integer Move Instructions
+  case RISCV::VMV_V_X:
+  // 12.1. Vector Single-Width Saturating Add and Subtract
+  case RISCV::VSADDU_VX:
+  case RISCV::VSADD_VX:
+  case RISCV::VSSUBU_VX:
+  case RISCV::VSSUB_VX:
+  // 12.2. Vector Single-Width Averaging Add and Subtract
+  case RISCV::VAADDU_VX:
+  case RISCV::VAADD_VX:
+  case RISCV::VASUBU_VX:
+  case RISCV::VASUB_VX:
+  // 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+  case RISCV::VSMUL_VX:
+  // 16.1. Integer Scalar Move Instructions
+  case RISCV::VMV_S_X:
+    return 1U << Log2SEW;
+  }
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 99c907a98121ae..d56d3c0b303bf9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -265,6 +265,12 @@ int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
 // one of the instructions does not have rounding mode, false will be returned.
 bool hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2);
 
+// If \p Opcode is a .vx vector instruction, returns the lower number of bits
+// that are used from the scalar .x operand for a given \p Log2SEW. Otherwise
+// returns null.
+std::optional<unsigned> getVectorLowDemandedScalarBits(uint16_t Opcode,
+                                                       unsigned Log2SEW);
+
 // Special immediate for AVL operand of V pseudo instructions to indicate VLMax.
 static constexpr int64_t VLMaxSentinel = -1LL;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index f73d85787b67bb..d92d3975d12f53 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2045,6 +2045,56 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM<SDNode vop, string instruction_name> {
   }
 }
 
+multiclass VPatSlideVL_VX_VI<SDNode vop, string instruction_name> {
+  foreach vti = AllVectors in {
+    let Predicates = GetVTypePredicates<vti>.Predicates in {
+      def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rd),
+                                 (vti.Vector vti.RegClass:$rs1),
+                                 uimm5:$rs2, (vti.Mask V0),
+                                 VLOpFrag, (XLenVT timm:$policy))),
+                (!cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK")
+                    vti.RegClass:$rd, vti.RegClass:$rs1, uimm5:$rs2,
+                    (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                    (XLenVT timm:$policy))>;
+
+      def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rd),
+                                 (vti.Vector vti.RegClass:$rs1),
+                                 GPR:$rs2, (vti.Mask V0),
+                                 VLOpFrag, (XLenVT timm:$policy))),
+                (!cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX#"_MASK")
+                    vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2,
+                    (vti.Mask V0), GPR:$vl, vti.Log2SEW,
+                    (XLenVT timm:$policy))>;
+    }
+  }
+}
+
+multiclass VPatSlide1VL_VX<SDNode vop, string instruction_name> {
+  foreach vti = AllIntegerVectors in {
+    let Predicates = GetVTypePredicates<vti>.Predicates in {
+      def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rs3),
+                                 (vti.Vector vti.RegClass:$rs1),
+                                 GPR:$rs2, (vti.Mask V0), VLOpFrag)),
+                (!cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX#"_MASK")
+                    vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
+                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
+    }
+  }
+}
+
+multiclass VPatSlide1VL_VF<SDNode vop, string instruction_name> {
+  foreach vti = AllFloatVectors in {
+    let Predicates = GetVTypePredicates<vti>.Predicates in {
+      def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rs3),
+                                 (vti.Vector vti.RegClass:$rs1),
+                                 vti.Scalar:$rs2, (vti.Mask V0), VLOpFrag)),
+                (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_MASK")
+                    vti.RegClass:$rs3, vti.RegClass:$rs1, vti.Scalar:$rs2,
+                    (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Patterns.
 //===----------------------------------------------------------------------===//
@@ -2937,70 +2987,12 @@ foreach vti = AllIntegerVectors in {
               (!cast<Instruction>("PseudoVID_V_"#vti.LMul.MX#"_MASK")
                   (vti.Vector (IMPLICIT_DEF)), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
                   TAIL_AGNOSTIC)>;
-    def : Pat<(vti.Vector (riscv_slide1up_vl (vti.Vector vti.RegClass:$rd),
-                                             (vti.Vector vti.RegClass:$rs1),
-                                             GPR:$rs2, (vti.Mask true_mask),
-                                             VLOpFrag)),
-              (!cast<Instruction>("PseudoVSLIDE1UP_VX_"#vti.LMul.MX)
-                  vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(vti.Vector (riscv_slide1down_vl (vti.Vector vti.RegClass:$rd),
-                                               (vti.Vector vti.RegClass:$rs1),
-                                               GPR:$rs2, (vti.Mask true_mask),
-                                               VLOpFrag)),
-              (!cast<Instruction>("PseudoVSLIDE1DOWN_VX_"#vti.LMul.MX)
-                  vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
   }
 }
 
-foreach vti = AllFloatVectors in {
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
-  def : Pat<(vti.Vector (riscv_fslide1up_vl (vti.Vector vti.RegClass:$rd),
-                                            (vti.Vector vti.RegClass:$rs1),
-                                            vti.Scalar:$rs2, (vti.Mask true_mask),
-                                            VLOpFrag)),
-            (!cast<Instruction>("PseudoVFSLIDE1UP_V"#vti.ScalarSuffix#"_"#vti.LMul.MX)
-                vti.RegClass:$rd, vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
-  def : Pat<(vti.Vector (riscv_fslide1down_vl (vti.Vector vti.RegClass:$rd),
-                                              (vti.Vector vti.RegClass:$rs1),
-                                              vti.Scalar:$rs2, (vti.Mask true_mask),
-                                              VLOpFrag)),
-            (!cast<Instruction>("PseudoVFSLIDE1DOWN_V"#vti.ScalarSuffix#"_"#vti.LMul.MX)
-                vti.RegClass:$rd, vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
-  }
-}
-
-foreach vti = AllVectors in {
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(vti.Vector (riscv_slideup_vl (vti.Vector vti.RegClass:$rs3),
-                                            (vti.Vector vti.RegClass:$rs1),
-                                            uimm5:$rs2, (vti.Mask true_mask),
-                                            VLOpFrag, (XLenVT timm:$policy))),
-              (!cast<Instruction>("PseudoVSLIDEUP_VI_"#vti.LMul.MX)
-                  vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
-                  GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
-
-    def : Pat<(vti.Vector (riscv_slideup_vl (vti.Vector vti.RegClass:$rs3),
-                                            (vti.Vector vti.RegClass:$rs1),
-                                            GPR:$rs2, (vti.Mask true_mask),
-                                            VLOpFrag, (XLenVT timm:$policy))),
-              (!cast<Instruction>("PseudoVSLIDEUP_VX_"#vti.LMul.MX)
-                  vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
-                  GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
-
-    def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3),
-                                              (vti.Vector vti.RegClass:$rs1),
-                                              uimm5:$rs2, (vti.Mask true_mask),
-                                              VLOpFrag, (XLenVT timm:$policy))),
-              (!cast<Instruction>("PseudoVSLIDEDOWN_VI_"#vti.LMul.MX)
-                  vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
-                  GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
-
-    def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3),
-                                              (vti.Vector vti.RegClass:$rs1),
-                                              GPR:$rs2, (vti.Mask true_mask),
-                                              VLOpFrag, (XLenVT timm:$policy))),
-              (!cast<Instruction>("PseudoVSLIDEDOWN_VX_"#vti.LMul.MX)
-                  vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
-                  GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
-  }
-}
+defm : VPatSlideVL_VX_VI<riscv_slideup_vl, "PseudoVSLIDEUP">;
+defm : VPatSlideVL_VX_VI<riscv_slidedown_vl, "PseudoVSLIDEDOWN">;
+defm : VPatSlide1VL_VX<riscv_slide1up_vl, "PseudoVSLIDE1UP">;
+defm : VPatSlide1VL_VF<riscv_fslide1up_vl, "PseudoVFSLIDE1UP">;
+defm : VPatSlide1VL_VX<riscv_slide1down_vl, "PseudoVSLIDE1DOWN">;
+defm : VPatSlide1VL_VF<riscv_fslide1down_vl, "PseudoVFSLIDE1DOWN">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td
index 7c304dca9a253b..5d6e8821b85931 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file describes the RISC-V instructions from the standard 'Zfa'
-// additional floating-point extension, version 0.1.
-// This version is still experimental as the 'Zfa' extension hasn't been
-// ratified yet.
+// additional floating-point extension, version 1.0.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index b904323a3bfb58..272d7f64b4d995 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -410,9 +410,10 @@ let Predicates = [HasStdExtZvksh] in {
 // SDNode patterns
 //===----------------------------------------------------------------------===//
 
-multiclass VPatUnarySDNode_V<SDPatternOperator op, string instruction_name> {
+multiclass VPatUnarySDNode_V<SDPatternOperator op, string instruction_name,
+                             Predicate predicate = HasStdExtZvbb> {
   foreach vti = AllIntegerVectors in {
-    let Predicates = !listconcat([HasStdExtZvbb],
+    let Predicates = !listconcat([predicate],
                                  GetVTypePredicates<vti>.Predicates) in {
       def : Pat<(vti.Vector (op (vti.Vector vti.RegClass:$rs1))),
                 (!cast<Instruction>(instruction_name#"_V_"#vti.LMul.MX)
@@ -431,7 +432,7 @@ def riscv_vnot : PatFrag<(ops node:$rs1), (xor node:$rs1,
                                                (riscv_splat_vector -1))>;
 
 foreach vti = AllIntegerVectors in {
-  let Predicates = !listconcat([HasStdExtZvbb],
+  let Predicates = !listconcat([HasStdExtZvkb],
                                GetVTypePredicates<vti>.Predicates) in {
     def : Pat<(vti.Vector (and (riscv_vnot vti.RegClass:$rs1),
                                vti.RegClass:$rs2)),
@@ -452,7 +453,7 @@ foreach vti = AllIntegerVectors in {
 }
 
 defm : VPatUnarySDNode_V<bitreverse, "PseudoVBREV">;
-defm : VPatUnarySDNode_V<bswap, "PseudoVREV8">;
+defm : VPatUnarySDNode_V<bswap, "PseudoVREV8", HasStdExtZvkb>;
 defm : VPatUnarySDNode_V<ctlz, "PseudoVCLZ">;
 defm : VPatUnarySDNode_V<cttz, "PseudoVCTZ">;
 defm : VPatUnarySDNode_V<ctpop, "PseudoVCPOP">;
@@ -480,7 +481,7 @@ def InvRot64Imm : SDNodeXForm<imm, [{
 // Although there is no vrol.vi, an immediate rotate left can be achieved by
 // negating the immediate in vror.vi
 foreach vti = AllIntegerVectors in {
-  let Predicates = !listconcat([HasStdExtZvbb],
+  let Predicates = !listconcat([HasStdExtZvkb],
                                GetVTypePredicates<vti>.Predicates) in {
     def : Pat<(vti.Vector (rotl vti.RegClass:$rs2,
                                 (vti.Vector (SplatPat_uimm6 uimm6:$rs1)))),
@@ -526,9 +527,10 @@ foreach vtiToWti = AllWidenableIntVectors in {
 // VL patterns
 //===----------------------------------------------------------------------===//
 
-multiclass VPatUnaryVL_V<SDPatternOperator op, string instruction_name> {
+multiclass VPatUnaryVL_V<SDPatternOperator op, string instruction_name,
+                         Predicate predicate = HasStdExtZvbb> {
   foreach vti = AllIntegerVectors in {
-    let Predicates = !listconcat([HasStdExtZvbb],
+    let Predicates = !listconcat([predicate],
                                  GetVTypePredicates<vti>.Predicates) in {
       def : Pat<(vti.Vector (op (vti.Vector vti.RegClass:$rs1),
                                 (vti.Vector vti.RegClass:$merge),
@@ -546,7 +548,7 @@ multiclass VPatUnaryVL_V<SDPatternOperator op, string instruction_name> {
 }
 
 foreach vti = AllIntegerVectors in {
-  let Predicates = !listconcat([HasStdExtZvbb],
+  let Predicates = !listconcat([HasStdExtZvkb],
                                GetVTypePredicates<vti>.Predicates) in {
     def : Pat<(vti.Vector (riscv_and_vl (riscv_xor_vl
                                            (vti.Vector vti.RegClass:$rs1),
@@ -585,7 +587,7 @@ foreach vti = AllIntegerVectors in {
 }
 
 defm : VPatUnaryVL_V<riscv_bitreverse_vl, "PseudoVBREV">;
-defm : VPatUnaryVL_V<riscv_bswap_vl, "PseudoVREV8">;
+defm : VPatUnaryVL_V<riscv_bswap_vl, "PseudoVREV8", HasStdExtZvkb>;
 defm : VPatUnaryVL_V<riscv_ctlz_vl, "PseudoVCLZ">;
 defm : VPatUnaryVL_V<riscv_cttz_vl, "PseudoVCTZ">;
 defm : VPatUnaryVL_V<riscv_ctpop_vl, "PseudoVCPOP">;
@@ -594,7 +596,7 @@ defm : VPatBinaryVL_VV_VX<riscv_rotl_vl, "PseudoVROL">;
 // Although there is no vrol.vi, an immediate rotate left can be achieved by
 // negating the immediate in vror.vi
 foreach vti = AllIntegerVectors in {
-  let Predicates = !listconcat([HasStdExtZvbb],
+  let Predicates = !listconcat([HasStdExtZvkb],
                                GetVTypePredicates<vti>.Predicates) in {
     def : Pat<(riscv_rotl_vl vti.RegClass:$rs2,
                              (vti.Vector (SplatPat_uimm6 uimm6:$rs1)),
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 7c3eb3bdf7ea4b..439a1bb6e1e69d 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -88,7 +88,7 @@ static bool vectorPseudoHasAllNBitUsers(const MachineOperand &UserOp,
     return false;
 
   const MCInstrDesc &MCID = MI.getDesc();
-  const uint64_t TSFlags = MI.getDesc().TSFlags;
+  const uint64_t TSFlags = MCID.TSFlags;
   if (!RISCVII::hasSEWOp(TSFlags))
     return false;
   assert(RISCVII::hasVLOp(TSFlags));
@@ -97,120 +97,9 @@ static bool vectorPseudoHasAllNBitUsers(const MachineOperand &UserOp,
   if (UserOp.getOperandNo() == RISCVII::getVLOpNum(MCID))
     return false;
 
-  // TODO: Handle Zvbb instructions
-  switch (PseudoInfo->BaseInstr) {
-  default:
-    return false;
-
-  // 11.6. Vector Single-Width Shift Instructions
-  case RISCV::VSLL_VX:
-  case RISCV::VSRL_VX:
-  case RISCV::VSRA_VX:
-  // 12.4. Vector Single-Width Scaling Shift Instructions
-  case RISCV::VSSRL_VX:
-  case RISCV::VSSRA_VX:
-    // Only the low lg2(SEW) bits of the shift-amount value are used.
-    if (Bits < Log2SEW)
-      return false;
-    break;
-
-  // 11.7 Vector Narrowing Integer Right Shift Instructions
-  case RISCV::VNSRL_WX:
-  case RISCV::VNSRA_WX:
-  // 12.5. Vector Narrowing Fixed-Point Clip Instructions
-  case RISCV::VNCLIPU_WX:
-  case RISCV::VNCLIP_WX:
-    // Only the low lg2(2*SEW) bits of the shift-amount value are used.
-    if (Bits < Log2SEW + 1)
-      return false;
-    break;
-
-  // 11.1. Vector Single-Width Integer Add and Subtract
-  case RISCV::VADD_VX:
-  case RISCV::VSUB_VX:
-  case RISCV::VRSUB_VX:
-  // 11.2. Vector Widening Integer Add/Subtract
-  case RISCV::VWADDU_VX:
-  case RISCV::VWSUBU_VX:
-  case RISCV::VWADD_VX:
-  case RISCV::VWSUB_VX:
-  case RISCV::VWADDU_WX:
-  case RISCV::VWSUBU_WX:
-  case RISCV::VWADD_WX:
-  case RISCV::VWSUB_WX:
-  // 11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  case RISCV::VADC_VXM:
-  case RISCV::VADC_VIM:
-  case RISCV::VMADC_VXM:
-  case RISCV::VMADC_VIM:
-  case RISCV::VMADC_VX:
-  case RISCV::VSBC_VXM:
-  case RISCV::VMSBC_VXM:
-  case RISCV::VMSBC_VX:
-  // 11.5 Vector Bitwise Logical Instructions
-  case RISCV::VAND_VX:
-  case RISCV::VOR_VX:
-  case RISCV::VXOR_VX:
-  // 11.8. Vector Integer Compare Instructions
-  case RISCV::VMSEQ_VX:
-  case RISCV::VMSNE_VX:
-  case RISCV::VMSLTU_VX:
-  case RISCV::VMSLT_VX:
-  case RISCV::VMSLEU_VX:
-  case RISCV::VMSLE_VX:
-  case RISCV::VMSGTU_VX:
-  case RISCV::VMSGT_VX:
-  // 11.9. Vector Integer Min/Max Instructions
-  case RISCV::VMINU_VX:
-  case RISCV::VMIN_VX:
-  case RISCV::VMAXU_VX:
-  case RISCV::VMAX_VX:
-  // 11.10. Vector Single-Width Integer Multiply Instructions
-  case RISCV::VMUL_VX:
-  case RISCV::VMULH_VX:
-  case RISCV::VMULHU_VX:
-  case RISCV::VMULHSU_VX:
-  // 11.11. Vector Integer Divide Instructions
-  case RISCV::VDIVU_VX:
-  case RISCV::VDIV_VX:
-  case RISCV::VREMU_VX:
-  case RISCV::VREM_VX:
-  // 11.12. Vector Widening Integer Multiply Instructions
-  case RISCV::VWMUL_VX:
-  case RISCV::VWMULU_VX:
-  case RISCV::VWMULSU_VX:
-  // 11.13. Vector Single-Width Integer Multiply-Add Instructions
-  case RISCV::VMACC_VX:
-  case RISCV::VNMSAC_VX:
-  case RISCV::VMADD_VX:
-  case RISCV::VNMSUB_VX:
-  // 11.14. Vector Widening Integer Multiply-Add Instructions
-  case RISCV::VWMACCU_VX:
-  case RISCV::VWMACC_VX:
-  case RISCV::VWMACCSU_VX:
-  case RISCV::VWMACCUS_VX:
-  // 11.15. Vector Integer Merge Instructions
-  case RISCV::VMERGE_VXM:
-  // 11.16. Vector Integer Move Instructions
-  case RISCV::VMV_V_X:
-  // 12.1. Vector Single-Width Saturating Add and Subtract
-  case RISCV::VSADDU_VX:
-  case RISCV::VSADD_VX:
-  case RISCV::VSSUBU_VX:
-  case RISCV::VSSUB_VX:
-  // 12.2. Vector Single-Width Averaging Add and Subtract
-  case RISCV::VAADDU_VX:
-  case RISCV::VAADD_VX:
-  case RISCV::VASUBU_VX:
-  case RISCV::VASUB_VX:
-  // 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
-  case RISCV::VSMUL_VX:
-  // 16.1. Integer Scalar Move Instructions
-  case RISCV::VMV_S_X:
-    if (Bits < (1U << Log2SEW))
-      return false;
-  }
-  return true;
+  auto NumDemandedBits =
+      RISCV::getVectorLowDemandedScalarBits(PseudoInfo->BaseInstr, Log2SEW);
+  return NumDemandedBits && Bits >= *NumDemandedBits;
 }
 
 // Checks if all users only demand the lower \p OrigBits of the original
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 2f510a215d80bf..e947d4d1e8acde 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -322,7 +322,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           LT.second.getVectorElementType().getSizeInBits() ==
               Tp->getElementType()->getPrimitiveSizeInBits() &&
           LT.second.getVectorNumElements() <
-              cast<FixedVectorType>(Tp)->getNumElements()) {
+              cast<FixedVectorType>(Tp)->getNumElements() &&
+          divideCeil(Mask.size(),
+                     cast<FixedVectorType>(Tp)->getNumElements()) ==
+              static_cast<unsigned>(*LT.first.getValue())) {
         unsigned NumRegs = *LT.first.getValue();
         unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
         unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
@@ -498,7 +501,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     InstructionCost Cost = MemCost;
     for (unsigned Index : Indices) {
       FixedVectorType *SubVecTy =
-          FixedVectorType::get(FVTy->getElementType(), VF);
+          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
       auto Mask = createStrideMask(Index, Factor, VF);
       InstructionCost ShuffleCost =
           getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 11a59df899a1a0..75a37060a8d06a 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -1423,6 +1423,18 @@ void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized(
 
   // Setup stack frame offset
   MFFrame.setOffsetAdjustment(Regs.getStackPointerBias());
+
+  // Nothing to do for leaf functions.
+  uint64_t StackSize = MFFrame.estimateStackSize(MF);
+  if (StackSize == 0 && MFFrame.getCalleeSavedInfo().empty())
+    return;
+
+  // Although the XPLINK specifications for AMODE64 state that minimum size
+  // of the param area is minimum 32 bytes and no rounding is otherwise
+  // specified, we round this area in 64 bytes increments to be compatible
+  // with existing compilers.
+  MFFrame.setMaxCallFrameSize(
+      std::max(64U, (unsigned)alignTo(MFFrame.getMaxCallFrameSize(), 64)));
 }
 
 // Determines the size of the frame, and creates the deferred spill objects.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 886b5e93bffc0d..5c5cb964bd28db 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -691,6 +691,19 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
 
   // Default to having -disable-strictnode-mutation on
   IsStrictFPEnabled = true;
+
+  if (Subtarget.isTargetzOS()) {
+    struct RTLibCallMapping {
+      RTLIB::Libcall Code;
+      const char *Name;
+    };
+    static RTLibCallMapping RTLibCallCommon[] = {
+#define HANDLE_LIBCALL(code, name) {RTLIB::code, name},
+#include "ZOSLibcallNames.def"
+    };
+    for (auto &E : RTLibCallCommon)
+      setLibcallName(E.Code, E.Name);
+  }
 }
 
 bool SystemZTargetLowering::useSoftFloat() const {
@@ -1803,13 +1816,6 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = ArgCCInfo.getStackSize();
 
-  if (Subtarget.isTargetXPLINK64())
-    // Although the XPLINK specifications for AMODE64 state that minimum size
-    // of the param area is minimum 32 bytes and no rounding is otherwise
-    // specified, we round this area in 64 bytes increments to be compatible
-    // with existing compilers.
-    NumBytes = std::max(64U, (unsigned)alignTo(NumBytes, 64));
-
   // Mark the start of the call.
   if (!IsTailCall)
     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
diff --git a/llvm/lib/Target/SystemZ/ZOSLibcallNames.def b/llvm/lib/Target/SystemZ/ZOSLibcallNames.def
new file mode 100644
index 00000000000000..12a01522a7e643
--- /dev/null
+++ b/llvm/lib/Target/SystemZ/ZOSLibcallNames.def
@@ -0,0 +1,100 @@
+//===-- ZOSLibcallNames.def ----------------------------------- -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the runtime library functions on z/OS which can be
+// generated during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#if !defined(HANDLE_LIBCALL)
+#error "HANDLE_LIBCALL must be defined"
+#endif
+
+HANDLE_LIBCALL(TRUNC_F64, "@@TRNC@B")
+HANDLE_LIBCALL(TRUNC_F32, "@@FTRC@B")
+HANDLE_LIBCALL(TRUNC_F128, "@@LTRC@B")
+HANDLE_LIBCALL(SQRT_F64, "@@WSQT@B")
+HANDLE_LIBCALL(SQRT_F32, "@@FSQT@B")
+HANDLE_LIBCALL(SQRT_F128, "@@LSQT@B")
+HANDLE_LIBCALL(SIN_F64, "@@SSIN@B")
+HANDLE_LIBCALL(SIN_F32, "@@FSIN@B")
+HANDLE_LIBCALL(SIN_F128, "@@LSIN@B")
+HANDLE_LIBCALL(ROUND_F64, "@@ROUN@B")
+HANDLE_LIBCALL(ROUND_F32, "@@ROUNFB")
+HANDLE_LIBCALL(ROUND_F128, "@@ROUNLB")
+HANDLE_LIBCALL(RINT_F64, "@@SRNT@B")
+HANDLE_LIBCALL(RINT_F32, "@@RINTFB")
+HANDLE_LIBCALL(RINT_F128, "@@RINTLB")
+HANDLE_LIBCALL(REM_F64, "@@WFMD@B")
+HANDLE_LIBCALL(REM_F32, "@@FFMD@B")
+HANDLE_LIBCALL(REM_F128, "@@LFMD@B")
+HANDLE_LIBCALL(POW_F64, "@@WPOW@B")
+HANDLE_LIBCALL(POW_F32, "@@FPOW@B")
+HANDLE_LIBCALL(POW_F128, "@@LPOW@B")
+HANDLE_LIBCALL(NEARBYINT_F64, "@@NBYI@B")
+HANDLE_LIBCALL(NEARBYINT_F32, "@@NBYIFB")
+HANDLE_LIBCALL(NEARBYINT_F128, "@@NBYILB")
+HANDLE_LIBCALL(LROUND_F64, "@@ROND@B")
+HANDLE_LIBCALL(LROUND_F32, "@@FRND@B")
+HANDLE_LIBCALL(LROUND_F128, "@@LRND@B")
+HANDLE_LIBCALL(LRINT_F64, "@@LRNT@B")
+HANDLE_LIBCALL(LRINT_F32, "@@LRNTFB")
+HANDLE_LIBCALL(LRINT_F128, "@@LRNTLB")
+HANDLE_LIBCALL(LOG_F64, "@@WLOG@B")
+HANDLE_LIBCALL(LOG_F32, "@@FLOG@B")
+HANDLE_LIBCALL(LOG_F128, "@@LLOG@B")
+HANDLE_LIBCALL(LOG2_F64, "@@LOG2@B")
+HANDLE_LIBCALL(LOG2_F32, "@@FLG2@B")
+HANDLE_LIBCALL(LOG2_F128, "@@LLG2@B")
+HANDLE_LIBCALL(LOG10_F64, "@@WLG1@B")
+HANDLE_LIBCALL(LOG10_F32, "@@FLG1@B")
+HANDLE_LIBCALL(LOG10_F128, "@@LLG1@B")
+HANDLE_LIBCALL(LLROUND_F64, "@@LLRD@B")
+HANDLE_LIBCALL(LLROUND_F32, "@@LLRDFB")
+HANDLE_LIBCALL(LLROUND_F128, "@@LLRDLB")
+HANDLE_LIBCALL(LLRINT_F64, "@@LLRT@B")
+HANDLE_LIBCALL(LLRINT_F32, "@@LLRTFB")
+HANDLE_LIBCALL(LLRINT_F128, "@@LLRTLB")
+HANDLE_LIBCALL(LDEXP_F64, "@@SLXP@B")
+HANDLE_LIBCALL(LDEXP_F32, "@@FLXP@B")
+HANDLE_LIBCALL(LDEXP_F128, "@@LLXP@B")
+HANDLE_LIBCALL(FREXP_F64, "@@SFXP@B")
+HANDLE_LIBCALL(FREXP_F32, "@@FFXP@B")
+HANDLE_LIBCALL(FREXP_F128, "@@LFXP@B")
+HANDLE_LIBCALL(FMIN_F64, "@@FMIN@B")
+HANDLE_LIBCALL(FMIN_F32, "@@FMINFB")
+HANDLE_LIBCALL(FMIN_F128, "@@FMINLB")
+HANDLE_LIBCALL(FMA_F64, "@@FMA@B")
+HANDLE_LIBCALL(FMA_F32, "@@FMAFB")
+HANDLE_LIBCALL(FMA_F128, "@@FMALB")
+HANDLE_LIBCALL(FMAX_F64, "@@FMAX@B")
+HANDLE_LIBCALL(FMAX_F32, "@@FMAXFB")
+HANDLE_LIBCALL(FMAX_F128, "@@FMAXLB")
+HANDLE_LIBCALL(FLOOR_F64, "@@SFLR@B")
+HANDLE_LIBCALL(FLOOR_F32, "@@FFLR@B")
+HANDLE_LIBCALL(FLOOR_F128, "@@LFLR@B")
+HANDLE_LIBCALL(EXP_F64, "@@WEXP@B")
+HANDLE_LIBCALL(EXP_F32, "@@FEXP@B")
+HANDLE_LIBCALL(EXP_F128, "@@LEXP@B")
+HANDLE_LIBCALL(EXP2_F64, "@@EXP2@B")
+HANDLE_LIBCALL(EXP2_F32, "@@FXP2@B")
+HANDLE_LIBCALL(EXP2_F128, "@@LXP2@B")
+HANDLE_LIBCALL(COS_F64, "@@SCOS@B")
+HANDLE_LIBCALL(COS_F32, "@@FCOS@B")
+HANDLE_LIBCALL(COS_F128, "@@LCOS@B")
+HANDLE_LIBCALL(COPYSIGN_F64, "@@DCPY@B")
+HANDLE_LIBCALL(COPYSIGN_F32, "@@FCPY@B")
+HANDLE_LIBCALL(COPYSIGN_F128, "@@LCPY@B")
+HANDLE_LIBCALL(CEIL_F64, "@@SCEL@B")
+HANDLE_LIBCALL(CEIL_F32, "@@FCEL@B")
+HANDLE_LIBCALL(CEIL_F128, "@@LCEL@B")
+HANDLE_LIBCALL(CBRT_F64, "@@SCRT@B")
+HANDLE_LIBCALL(CBRT_F32, "@@FCBT@B")
+HANDLE_LIBCALL(CBRT_F128, "@@LCBT@B")
+
+#undef HANDLE_LIBCALL
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 4a329bef468199..86fb99cc98a905 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -71,8 +71,7 @@ void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
   // that is a reference type.
   wasm::ValType ValTy;
   bool IsTable = false;
-  if (GlobalVT->isArrayTy() && WebAssembly::isWebAssemblyReferenceType(
-                                   GlobalVT->getArrayElementType())) {
+  if (WebAssembly::isWebAssemblyTableType(GlobalVT)) {
     IsTable = true;
     const Type *ElTy = GlobalVT->getArrayElementType();
     if (WebAssembly::isWebAssemblyExternrefType(ElTy))
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
index d59fcb3c1b5967..a8860477a2472c 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
@@ -28,14 +28,16 @@ namespace WebAssembly {
 
 /// Return true if this is a WebAssembly Externref Type.
 inline bool isWebAssemblyExternrefType(const Type *Ty) {
-  return Ty->getPointerAddressSpace() ==
-         WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF;
+  return Ty->isPointerTy() &&
+         Ty->getPointerAddressSpace() ==
+             WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF;
 }
 
 /// Return true if this is a WebAssembly Funcref Type.
 inline bool isWebAssemblyFuncrefType(const Type *Ty) {
-  return Ty->getPointerAddressSpace() ==
-         WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF;
+  return Ty->isPointerTy() &&
+         Ty->getPointerAddressSpace() ==
+             WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF;
 }
 
 /// Return true if this is a WebAssembly Reference Type.
@@ -43,6 +45,12 @@ inline bool isWebAssemblyReferenceType(const Type *Ty) {
   return isWebAssemblyExternrefType(Ty) || isWebAssemblyFuncrefType(Ty);
 }
 
+/// Return true if the table represents a WebAssembly table type.
+inline bool isWebAssemblyTableType(const Type *Ty) {
+  return Ty->isArrayTy() &&
+         isWebAssemblyReferenceType(Ty->getArrayElementType());
+}
+
 // Convert StringRef to ValType / HealType / BlockType
 
 MVT parseMVT(StringRef Type);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 131e99c66fa2e5..d8cbddf74545da 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -667,7 +667,7 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
 
   // When there is an unconditional branch right before a catch instruction and
   // it branches to the end of end_try marker, we don't need the branch, because
-  // it there is no exception, the control flow transfers to that point anyway.
+  // if there is no exception, the control flow transfers to that point anyway.
   // bb0:
   //   try
   //     ...
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index e60f1397b99335..8f3ad167ae41fc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -130,7 +130,15 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame(
 bool WebAssemblyFrameLowering::needsSPForLocalFrame(
     const MachineFunction &MF) const {
   auto &MFI = MF.getFrameInfo();
-  return MFI.getStackSize() || MFI.adjustsStack() || hasFP(MF);
+  auto &MRI = MF.getRegInfo();
+  // llvm.stacksave can explicitly read SP register and it can appear without
+  // dynamic alloca.
+  bool HasExplicitSPUse =
+      any_of(MRI.use_operands(getSPReg(MF)),
+             [](MachineOperand &MO) { return !MO.isImplicit(); });
+
+  return MFI.getStackSize() || MFI.adjustsStack() || hasFP(MF) ||
+         HasExplicitSPUse;
 }
 
 // In function with EH pads, we need to make a copy of the value of
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index fd154a90edef1d..61cfcdc914cdb9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1724,8 +1724,11 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
     fail(DL, DAG, "Invalid address space for WebAssembly target");
 
   unsigned OperandFlags = 0;
-  if (isPositionIndependent()) {
-    const GlobalValue *GV = GA->getGlobal();
+  const GlobalValue *GV = GA->getGlobal();
+  // Since WebAssembly tables cannot yet be shared accross modules, we don't
+  // need special treatment for tables in PIC mode.
+  if (isPositionIndependent() &&
+      !WebAssembly::isWebAssemblyTableType(GV->getValueType())) {
     if (getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) {
       MachineFunction &MF = DAG.getMachineFunction();
       MVT PtrVT = getPointerTy(MF.getDataLayout());
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 10464e8f52e561..88c727796e0dfd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -127,6 +127,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
   // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
   // 'unreachable' instructions which is meant for that case.
   this->Options.TrapUnreachable = true;
+  this->Options.NoTrapAfterNoreturn = false;
 
   // WebAssembly treats each function as an independent unit. Force
   // -ffunction-sections, effectively, so that we can emit them independently.
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index f60b7e74ce6e81..6dc8cd4b2b3e92 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -2391,7 +2391,7 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
     return false;
 
   // TODO: We could sign extend narrower types.
-  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+  EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
   if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
     return false;
 
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index f4cf9ddfe82953..694a2df195c092 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2296,6 +2296,28 @@ SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
         uint64_t Offset = (uint64_t)AddVal->getZExtValue();
         if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
           SDLoc DL(N);
+          SDValue Res;
+          // If we're also scaling, see if we can use that as well.
+          if (AddSrc.getOpcode() == ISD::SHL &&
+              isa<ConstantSDNode>(AddSrc.getOperand(1))) {
+            SDValue ShVal = AddSrc.getOperand(0);
+            uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
+            APInt HiBits =
+                APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
+            uint64_t ScaleAmt = 1ULL << ShAmt;
+            if ((AM.Scale * ScaleAmt) <= 8 &&
+                (AddSrc->getFlags().hasNoUnsignedWrap() ||
+                 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
+              AM.Scale *= ScaleAmt;
+              SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
+              SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
+                                                 AddSrc.getOperand(1));
+              insertDAGNode(*CurDAG, N, ExtShVal);
+              insertDAGNode(*CurDAG, N, ExtShift);
+              AddSrc = ExtShift;
+              Res = ExtShVal;
+            }
+          }
           SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
           SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
           SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
@@ -2304,7 +2326,7 @@ SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
           insertDAGNode(*CurDAG, N, ExtAdd);
           CurDAG->ReplaceAllUsesWith(N, ExtAdd);
           CurDAG->RemoveDeadNode(N.getNode());
-          return ExtSrc;
+          return Res ? Res : ExtSrc;
         }
       }
     }
@@ -2589,8 +2611,18 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
       break;
 
-    // Peek through mask: zext(and(shl(x,c1),c2))
     SDValue Src = N.getOperand(0);
+
+    // See if we can match a zext(addlike(x,c)).
+    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
+    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
+      if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
+        if (Index != N) {
+          AM.IndexReg = Index;
+          return false;
+        }
+
+    // Peek through mask: zext(and(shl(x,c1),c2))
     APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
     if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
       if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
@@ -2613,7 +2645,8 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
       // That makes it safe to widen to the destination type.
       APInt HighZeros =
           APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
-      if (!CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
+      if (!Src->getFlags().hasNoUnsignedWrap() &&
+          !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
         break;
 
       // zext (shl nuw i8 %x, C1) to i32
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 08b9098871b354..c4cd2a672fe7b2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52207,9 +52207,13 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
   if (Add.getOpcode() != ISD::ADD)
     return SDValue();
 
+  SDValue AddOp0 = Add.getOperand(0);
+  SDValue AddOp1 = Add.getOperand(1);
   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
   bool NSW = Add->getFlags().hasNoSignedWrap();
   bool NUW = Add->getFlags().hasNoUnsignedWrap();
+  NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
+  NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
 
   // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
   // into the 'zext'
@@ -52219,8 +52223,8 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
   // Having a constant operand to the 'add' ensures that we are not increasing
   // the instruction count because the constant is extended for free below.
   // A constant operand can also become the displacement field of an LEA.
-  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
-  if (!AddOp1)
+  auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
+  if (!AddOp1C)
     return SDValue();
 
   // Don't make the 'add' bigger if there's no hope of combining it with some
@@ -52239,10 +52243,9 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
     return SDValue();
 
   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
-  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
-  SDValue AddOp0 = Add.getOperand(0);
+  int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
-  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+  SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
 
   // The wider add is guaranteed to not wrap because both operands are
   // sign-extended.
@@ -54448,7 +54451,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
   // but it currently struggles with different vector widths.
   if (llvm::all_of(Ops, [Op0](SDValue Op) {
-        return Op.getOpcode() == Op0.getOpcode();
+        return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
       })) {
     auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
       SmallVector<SDValue> Subs;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d838d1f96e310d..884f22b006bcba 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1481,6 +1481,10 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   if (Kind == TTI::SK_Broadcast)
     LT.first = 1;
 
+  // Treat <X x bfloat> shuffles as <X x half>.
+  if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
+    LT.second = LT.second.changeVectorElementType(MVT::f16);
+
   // Subvector extractions are free if they start at the beginning of a
   // vector and cheap if the subvectors are aligned.
   if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
@@ -1596,7 +1600,6 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
         LegalVT.getVectorNumElements() <
             cast<FixedVectorType>(BaseTp)->getNumElements()) {
-
       unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
       unsigned LegalVTSize = LegalVT.getStoreSize();
       // Number of source vectors after legalization:
@@ -1621,6 +1624,10 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         // copy of the previous destination register (the cost is
         // TTI::TCC_Basic). If the source register is just reused, the cost for
         // this operation is 0.
+        NumOfDests =
+            getTypeLegalizationCost(
+                FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
+                .first;
         unsigned E = *NumOfDests.getValue();
         unsigned NormalizedVF =
             LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
@@ -1635,7 +1642,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
             [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
              &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
-              if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
+              if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
                 // Check if the previous register can be just copied to the next
                 // one.
                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 0f75de1ba6032b..fdc39c7fa478a6 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1292,10 +1292,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
   std::optional<FieldIDType> SwitchIndexFieldId;
 
   if (Shape.ABI == coro::ABI::Switch) {
-    auto *FramePtrTy = FrameTy->getPointerTo();
-    auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy,
-                                   /*IsVarArg=*/false);
-    auto *FnPtrTy = FnTy->getPointerTo();
+    auto *FnPtrTy = PointerType::getUnqual(C);
 
     // Add header fields for the resume and destroy functions.
     // We can rely on these being perfectly packed.
@@ -2487,7 +2484,7 @@ static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
                                      coro::Shape &Shape) {
   // Make a fake function pointer as a sort of intrinsic.
   auto FnTy = FunctionType::get(ValueTy, {}, false);
-  auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());
+  auto Fn = ConstantPointerNull::get(Builder.getPtrTy());
 
   auto Call = Builder.CreateCall(FnTy, Fn, {});
   Shape.SwiftErrorOps.push_back(Call);
@@ -2501,9 +2498,9 @@ static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
 static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
                                      coro::Shape &Shape) {
   // Make a fake function pointer as a sort of intrinsic.
-  auto FnTy = FunctionType::get(V->getType()->getPointerTo(),
+  auto FnTy = FunctionType::get(Builder.getPtrTy(),
                                 {V->getType()}, false);
-  auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());
+  auto Fn = ConstantPointerNull::get(Builder.getPtrTy());
 
   auto Call = Builder.CreateCall(FnTy, Fn, { V });
   Shape.SwiftErrorOps.push_back(Call);
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 98a989591f1c63..3808eef7ab4e27 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -10353,8 +10353,22 @@ struct AANoFPClassImpl : AANoFPClass {
                             /*Depth=*/0, TLI, AC, I, DT);
     State.addKnownBits(~KnownFPClass.KnownFPClasses);
 
-    bool TrackUse = false;
-    return TrackUse;
+    if (auto *CI = dyn_cast<CallInst>(UseV)) {
+      // Special case FP intrinsic with struct return type.
+      switch (CI->getIntrinsicID()) {
+      case Intrinsic::frexp:
+        return true;
+      case Intrinsic::not_intrinsic:
+        // TODO: Could recognize math libcalls
+        return false;
+      default:
+        break;
+      }
+    }
+
+    if (!UseV->getType()->isFPOrFPVectorTy())
+      return false;
+    return !isa<LoadInst, AtomicRMWInst>(UseV);
   }
 
   const std::string getAsStr(Attributor *A) const override {
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index aa63e2b64d9eab..429de758c7c3fd 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -202,7 +202,7 @@ Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C)
   CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);
 
   uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
-                    BFI.getEntryFreq();
+                    BFI.getEntryFreq().getFrequency();
 
   Cost Latency = Weight *
       TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);
diff --git a/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
index d46f9a6c675757..f6f8956760848a 100644
--- a/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
+++ b/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -111,7 +111,7 @@ PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
     // Now compute the callsite count from relative frequency and
     // entry count:
     BasicBlock *CSBB = CB.getParent();
-    Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
+    Scaled64 EntryFreq(BFI.getEntryFreq().getFrequency(), 0);
     Scaled64 BBCount(BFI.getBlockFreq(CSBB).getFrequency(), 0);
     BBCount /= EntryFreq;
     BBCount *= Counts[Caller];
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 951492cc150f30..20c13de33f8189 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1445,9 +1445,11 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
   // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
   Constant *BA = nullptr, *CA = nullptr;
   if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
-                        m_Constant(CA))) &&
+                        m_ImmConstant(CA))) &&
       BA->isElementWiseEqual(CA) && A->getType() == DestTy) {
-    Constant *WideCurrShAmt = ConstantExpr::getSExt(CA, DestTy);
+    Constant *WideCurrShAmt =
+        ConstantFoldCastOperand(Instruction::SExt, CA, DestTy, DL);
+    assert(WideCurrShAmt && "Constant folding of ImmConstant cannot fail");
     Constant *NumLowbitsLeft = ConstantExpr::getSub(
         ConstantInt::get(DestTy, SrcTy->getScalarSizeInBits()), WideCurrShAmt);
     Constant *NewShAmt = ConstantExpr::getSub(
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d411d08a36113b..95b506f0e35faf 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include <bitset>
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -2895,19 +2896,89 @@ Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
   return new ICmpInst(SwappedPred, Add, ConstantInt::get(Ty, ~C));
 }
 
+static Value *createLogicFromTable(const std::bitset<4> &Table, Value *Op0,
+                                   Value *Op1, IRBuilderBase &Builder,
+                                   bool HasOneUse) {
+  switch (Table.to_ulong()) {
+  case 0: // 0 0 0 0
+    return Builder.getFalse();
+  case 1: // 0 0 0 1
+    return HasOneUse ? Builder.CreateNot(Builder.CreateOr(Op0, Op1)) : nullptr;
+  case 2: // 0 0 1 0
+    return HasOneUse ? Builder.CreateAnd(Builder.CreateNot(Op0), Op1) : nullptr;
+  case 3: // 0 0 1 1
+    return Builder.CreateNot(Op0);
+  case 4: // 0 1 0 0
+    return HasOneUse ? Builder.CreateAnd(Op0, Builder.CreateNot(Op1)) : nullptr;
+  case 5: // 0 1 0 1
+    return Builder.CreateNot(Op1);
+  case 6: // 0 1 1 0
+    return Builder.CreateXor(Op0, Op1);
+  case 7: // 0 1 1 1
+    return HasOneUse ? Builder.CreateNot(Builder.CreateAnd(Op0, Op1)) : nullptr;
+  case 8: // 1 0 0 0
+    return Builder.CreateAnd(Op0, Op1);
+  case 9: // 1 0 0 1
+    return HasOneUse ? Builder.CreateNot(Builder.CreateXor(Op0, Op1)) : nullptr;
+  case 10: // 1 0 1 0
+    return Op1;
+  case 11: // 1 0 1 1
+    return HasOneUse ? Builder.CreateOr(Builder.CreateNot(Op0), Op1) : nullptr;
+  case 12: // 1 1 0 0
+    return Op0;
+  case 13: // 1 1 0 1
+    return HasOneUse ? Builder.CreateOr(Op0, Builder.CreateNot(Op1)) : nullptr;
+  case 14: // 1 1 1 0
+    return Builder.CreateOr(Op0, Op1);
+  case 15: // 1 1 1 1
+    return Builder.getTrue();
+  default:
+    llvm_unreachable("Invalid Operation");
+  }
+  return nullptr;
+}
+
 /// Fold icmp (add X, Y), C.
 Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
                                                    BinaryOperator *Add,
                                                    const APInt &C) {
   Value *Y = Add->getOperand(1);
+  Value *X = Add->getOperand(0);
+
+  Value *Op0, *Op1;
+  Instruction *Ext0, *Ext1;
+  const CmpInst::Predicate Pred = Cmp.getPredicate();
+  if (match(Add,
+            m_Add(m_CombineAnd(m_Instruction(Ext0), m_ZExtOrSExt(m_Value(Op0))),
+                  m_CombineAnd(m_Instruction(Ext1),
+                               m_ZExtOrSExt(m_Value(Op1))))) &&
+      Op0->getType()->isIntOrIntVectorTy(1) &&
+      Op1->getType()->isIntOrIntVectorTy(1)) {
+    unsigned BW = C.getBitWidth();
+    std::bitset<4> Table;
+    auto ComputeTable = [&](bool Op0Val, bool Op1Val) {
+      int Res = 0;
+      if (Op0Val)
+        Res += isa<ZExtInst>(Ext0) ? 1 : -1;
+      if (Op1Val)
+        Res += isa<ZExtInst>(Ext1) ? 1 : -1;
+      return ICmpInst::compare(APInt(BW, Res, true), C, Pred);
+    };
+
+    Table[0] = ComputeTable(false, false);
+    Table[1] = ComputeTable(false, true);
+    Table[2] = ComputeTable(true, false);
+    Table[3] = ComputeTable(true, true);
+    if (auto *Cond =
+            createLogicFromTable(Table, Op0, Op1, Builder, Add->hasOneUse()))
+      return replaceInstUsesWith(Cmp, Cond);
+  }
   const APInt *C2;
   if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
     return nullptr;
 
   // Fold icmp pred (add X, C2), C.
-  Value *X = Add->getOperand(0);
   Type *Ty = Add->getType();
-  const CmpInst::Predicate Pred = Cmp.getPredicate();
 
   // If the add does not wrap, we can always adjust the compare by subtracting
   // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
@@ -6477,59 +6548,6 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) {
     }
   }
 
-  if (match(I.getOperand(0), m_c_Add(m_ZExt(m_Value(X)), m_SExt(m_Value(Y)))) &&
-      match(I.getOperand(1), m_APInt(C)) &&
-      X->getType()->isIntOrIntVectorTy(1) &&
-      Y->getType()->isIntOrIntVectorTy(1)) {
-    unsigned BitWidth = C->getBitWidth();
-    Pred = I.getPredicate();
-    APInt Zero = APInt::getZero(BitWidth);
-    APInt MinusOne = APInt::getAllOnes(BitWidth);
-    APInt One(BitWidth, 1);
-    if ((C->sgt(Zero) && Pred == ICmpInst::ICMP_SGT) ||
-        (C->slt(Zero) && Pred == ICmpInst::ICMP_SLT))
-      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-    if ((C->sgt(One) && Pred == ICmpInst::ICMP_SLT) ||
-        (C->slt(MinusOne) && Pred == ICmpInst::ICMP_SGT))
-      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
-
-    if (I.getOperand(0)->hasOneUse()) {
-      APInt NewC = *C;
-      // canonicalize predicate to eq/ne
-      if ((*C == Zero && Pred == ICmpInst::ICMP_SLT) ||
-          (*C != Zero && *C != MinusOne && Pred == ICmpInst::ICMP_UGT)) {
-        // x s< 0 in [-1, 1] --> x == -1
-        // x u> 1(or any const !=0 !=-1) in [-1, 1] --> x == -1
-        NewC = MinusOne;
-        Pred = ICmpInst::ICMP_EQ;
-      } else if ((*C == MinusOne && Pred == ICmpInst::ICMP_SGT) ||
-                 (*C != Zero && *C != One && Pred == ICmpInst::ICMP_ULT)) {
-        // x s> -1 in [-1, 1] --> x != -1
-        // x u< -1 in [-1, 1] --> x != -1
-        Pred = ICmpInst::ICMP_NE;
-      } else if (*C == Zero && Pred == ICmpInst::ICMP_SGT) {
-        // x s> 0 in [-1, 1] --> x == 1
-        NewC = One;
-        Pred = ICmpInst::ICMP_EQ;
-      } else if (*C == One && Pred == ICmpInst::ICMP_SLT) {
-        // x s< 1 in [-1, 1] --> x != 1
-        Pred = ICmpInst::ICMP_NE;
-      }
-
-      if (NewC == MinusOne) {
-        if (Pred == ICmpInst::ICMP_EQ)
-          return BinaryOperator::CreateAnd(Builder.CreateNot(X), Y);
-        if (Pred == ICmpInst::ICMP_NE)
-          return BinaryOperator::CreateOr(X, Builder.CreateNot(Y));
-      } else if (NewC == One) {
-        if (Pred == ICmpInst::ICMP_EQ)
-          return BinaryOperator::CreateAnd(X, Builder.CreateNot(Y));
-        if (Pred == ICmpInst::ICMP_NE)
-          return BinaryOperator::CreateOr(Builder.CreateNot(X), Y);
-      }
-    }
-  }
-
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index c90925fd6edd2f..7337cd7caec99c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -548,6 +548,15 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
                                     APInt &UndefElts, unsigned Depth = 0,
                                     bool AllowMultipleUsers = false) override;
 
+  /// Attempts to replace V with a simpler value based on the demanded
+  /// floating-point classes
+  Value *SimplifyDemandedUseFPClass(Value *V, FPClassTest DemandedMask,
+                                    KnownFPClass &Known, unsigned Depth,
+                                    Instruction *CxtI);
+  bool SimplifyDemandedFPClass(Instruction *I, unsigned Op,
+                               FPClassTest DemandedMask, KnownFPClass &Known,
+                               unsigned Depth = 0);
+
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
   Instruction *foldVectorSelect(SelectInst &Sel);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 5b8b74c8648b8c..b775a154e0e6b6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -996,8 +996,8 @@ static bool isDeadPHICycle(PHINode *PN,
 /// Return true if this phi node is always equal to NonPhiInVal.
 /// This happens with mutually cyclic phi nodes like:
 ///   z = some value; x = phi (y, z); y = phi (x, z)
-static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
-                           SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
+static bool PHIsEqualValue(PHINode *PN, Value *&NonPhiInVal,
+                           SmallPtrSetImpl<PHINode *> &ValueEqualPHIs) {
   // See if we already saw this PHI node.
   if (!ValueEqualPHIs.insert(PN).second)
     return true;
@@ -1010,8 +1010,11 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
   // the value.
   for (Value *Op : PN->incoming_values()) {
     if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
-      if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
-        return false;
+      if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs)) {
+        if (NonPhiInVal)
+          return false;
+        NonPhiInVal = OpPN;
+      }
     } else if (Op != NonPhiInVal)
       return false;
   }
@@ -1478,7 +1481,9 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
   //   z = some value; x = phi (y, z); y = phi (x, z)
   // where the phi nodes don't necessarily need to be in the same block.  Do a
   // quick check to see if the PHI node only contains a single non-phi value, if
-  // so, scan to see if the phi cycle is actually equal to that value.
+  // so, scan to see if the phi cycle is actually equal to that value. If the
+  // phi has no non-phi values then allow the "NonPhiInVal" to be set later if
+  // one of the phis itself does not have a single input.
   {
     unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
     // Scan for the first non-phi operand.
@@ -1486,25 +1491,25 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
            isa<PHINode>(PN.getIncomingValue(InValNo)))
       ++InValNo;
 
-    if (InValNo != NumIncomingVals) {
-      Value *NonPhiInVal = PN.getIncomingValue(InValNo);
+    Value *NonPhiInVal =
+        InValNo != NumIncomingVals ? PN.getIncomingValue(InValNo) : nullptr;
 
-      // Scan the rest of the operands to see if there are any conflicts, if so
-      // there is no need to recursively scan other phis.
+    // Scan the rest of the operands to see if there are any conflicts, if so
+    // there is no need to recursively scan other phis.
+    if (NonPhiInVal)
       for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
         Value *OpVal = PN.getIncomingValue(InValNo);
         if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
           break;
       }
 
-      // If we scanned over all operands, then we have one unique value plus
-      // phi values.  Scan PHI nodes to see if they all merge in each other or
-      // the value.
-      if (InValNo == NumIncomingVals) {
-        SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
-        if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
-          return replaceInstUsesWith(PN, NonPhiInVal);
-      }
+    // If we scanned over all operands, then we have one unique value plus
+    // phi values.  Scan PHI nodes to see if they all merge in each other or
+    // the value.
+    if (InValNo == NumIncomingVals) {
+      SmallPtrSet<PHINode *, 16> ValueEqualPHIs;
+      if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+        return replaceInstUsesWith(PN, NonPhiInVal);
     }
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index c42164da8aee1a..7a15c0dee492b5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1495,8 +1495,13 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
     if (!match(ReplacementLow, m_ImmConstant(LowC)) ||
         !match(ReplacementHigh, m_ImmConstant(HighC)))
       return nullptr;
-    ReplacementLow = ConstantExpr::getSExt(LowC, X->getType());
-    ReplacementHigh = ConstantExpr::getSExt(HighC, X->getType());
+    const DataLayout &DL = Sel0.getModule()->getDataLayout();
+    ReplacementLow =
+        ConstantFoldCastOperand(Instruction::SExt, LowC, X->getType(), DL);
+    ReplacementHigh =
+        ConstantFoldCastOperand(Instruction::SExt, HighC, X->getType(), DL);
+    assert(ReplacementLow && ReplacementHigh &&
+           "Constant folding of ImmConstant cannot fail");
   }
 
   // All good, finally emit the new pattern.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index be005e61a8d2d8..abdaf167d3d9df 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1781,3 +1781,139 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
 
   return MadeChange ? I : nullptr;
 }
+
+/// For floating-point classes that resolve to a single bit pattern, return that
+/// value.
+static Constant *getFPClassConstant(Type *Ty, FPClassTest Mask) {
+  switch (Mask) {
+  case fcPosZero:
+    return ConstantFP::getZero(Ty);
+  case fcNegZero:
+    return ConstantFP::getZero(Ty, true);
+  case fcPosInf:
+    return ConstantFP::getInfinity(Ty);
+  case fcNegInf:
+    return ConstantFP::getInfinity(Ty, true);
+  case fcNone:
+    return PoisonValue::get(Ty);
+  default:
+    return nullptr;
+  }
+}
+
+Value *InstCombinerImpl::SimplifyDemandedUseFPClass(
+    Value *V, const FPClassTest DemandedMask, KnownFPClass &Known,
+    unsigned Depth, Instruction *CxtI) {
+  assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
+  Type *VTy = V->getType();
+
+  assert(Known == KnownFPClass() && "expected uninitialized state");
+
+  if (DemandedMask == fcNone)
+    return isa<UndefValue>(V) ? nullptr : PoisonValue::get(VTy);
+
+  if (Depth == MaxAnalysisRecursionDepth)
+    return nullptr;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    // Handle constants and arguments
+    Known = computeKnownFPClass(V, fcAllFlags, CxtI, Depth + 1);
+    Value *FoldedToConst =
+        getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+    return FoldedToConst == V ? nullptr : FoldedToConst;
+  }
+
+  if (!I->hasOneUse())
+    return nullptr;
+
+  // TODO: Should account for nofpclass/FastMathFlags on current instruction
+  switch (I->getOpcode()) {
+  case Instruction::FNeg: {
+    if (SimplifyDemandedFPClass(I, 0, llvm::fneg(DemandedMask), Known,
+                                Depth + 1))
+      return I;
+    Known.fneg();
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(I);
+    switch (CI->getIntrinsicID()) {
+    case Intrinsic::fabs:
+      if (SimplifyDemandedFPClass(I, 0, llvm::inverse_fabs(DemandedMask), Known,
+                                  Depth + 1))
+        return I;
+      Known.fabs();
+      break;
+    case Intrinsic::arithmetic_fence:
+      if (SimplifyDemandedFPClass(I, 0, DemandedMask, Known, Depth + 1))
+        return I;
+      break;
+    case Intrinsic::copysign: {
+      // Flip on more potentially demanded classes
+      const FPClassTest DemandedMaskAnySign = llvm::unknown_sign(DemandedMask);
+      if (SimplifyDemandedFPClass(I, 0, DemandedMaskAnySign, Known, Depth + 1))
+        return I;
+
+      if ((DemandedMask & fcPositive) == fcNone) {
+        // Roundabout way of replacing with fneg(fabs)
+        I->setOperand(1, ConstantFP::get(VTy, -1.0));
+        return I;
+      }
+
+      if ((DemandedMask & fcNegative) == fcNone) {
+        // Roundabout way of replacing with fabs
+        I->setOperand(1, ConstantFP::getZero(VTy));
+        return I;
+      }
+
+      KnownFPClass KnownSign =
+          computeKnownFPClass(I->getOperand(1), fcAllFlags, CxtI, Depth + 1);
+      Known.copysign(KnownSign);
+      break;
+    }
+    default:
+      Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+      break;
+    }
+
+    break;
+  }
+  case Instruction::Select: {
+    KnownFPClass KnownLHS, KnownRHS;
+    if (SimplifyDemandedFPClass(I, 2, DemandedMask, KnownRHS, Depth + 1) ||
+        SimplifyDemandedFPClass(I, 1, DemandedMask, KnownLHS, Depth + 1))
+      return I;
+
+    if (KnownLHS.isKnownNever(DemandedMask))
+      return I->getOperand(2);
+    if (KnownRHS.isKnownNever(DemandedMask))
+      return I->getOperand(1);
+
+    // TODO: Recognize clamping patterns
+    Known = KnownLHS | KnownRHS;
+    break;
+  }
+  default:
+    Known = computeKnownFPClass(I, ~DemandedMask, CxtI, Depth + 1);
+    break;
+  }
+
+  return getFPClassConstant(VTy, DemandedMask & Known.KnownFPClasses);
+}
+
+bool InstCombinerImpl::SimplifyDemandedFPClass(Instruction *I, unsigned OpNo,
+                                               FPClassTest DemandedMask,
+                                               KnownFPClass &Known,
+                                               unsigned Depth) {
+  Use &U = I->getOperandUse(OpNo);
+  Value *NewVal =
+      SimplifyDemandedUseFPClass(U.get(), DemandedMask, Known, Depth, I);
+  if (!NewVal)
+    return false;
+  if (Instruction *OpInst = dyn_cast<Instruction>(U))
+    salvageDebugInfo(*OpInst);
+
+  replaceUse(U, NewVal);
+  return true;
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 69d7593d5fc1e3..c8b58c51d4e6ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2122,8 +2122,8 @@ static Instruction *foldSelectShuffleOfSelectShuffle(ShuffleVectorInst &Shuf) {
     NewMask[i] = Mask[i] < (signed)NumElts ? Mask[i] : Mask1[i];
 
   // A select mask with undef elements might look like an identity mask.
-  assert((ShuffleVectorInst::isSelectMask(NewMask) ||
-          ShuffleVectorInst::isIdentityMask(NewMask)) &&
+  assert((ShuffleVectorInst::isSelectMask(NewMask, NumElts) ||
+          ShuffleVectorInst::isIdentityMask(NewMask, NumElts)) &&
          "Unexpected shuffle mask");
   return new ShuffleVectorInst(X, Y, NewMask);
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f6aabe17e06eb4..0ee29149a1aeef 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2732,8 +2732,22 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI, Value *Op) {
 }
 
 Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
-  // Nothing for now.
-  return nullptr;
+  Value *RetVal = RI.getReturnValue();
+  if (!RetVal || !AttributeFuncs::isNoFPClassCompatibleType(RetVal->getType()))
+    return nullptr;
+
+  Function *F = RI.getFunction();
+  FPClassTest ReturnClass = F->getAttributes().getRetNoFPClass();
+  if (ReturnClass == fcNone)
+    return nullptr;
+
+  KnownFPClass KnownClass;
+  Value *Simplified =
+      SimplifyDemandedUseFPClass(RetVal, ~ReturnClass, KnownClass, 0, &RI);
+  if (!Simplified)
+    return nullptr;
+
+  return ReturnInst::Create(RI.getContext(), Simplified);
 }
 
 // WARNING: keep in sync with SimplifyCFGOpt::simplifyUnreachable()!
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index e80ee1953de6b2..9e9b192d442ce5 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -819,11 +819,11 @@ class ModuleAddressSanitizer {
 private:
   void initializeCallbacks(Module &M);
 
-  void instrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
+  bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
   void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
                              ArrayRef<GlobalVariable *> ExtendedGlobals,
                              ArrayRef<Constant *> MetadataInitializers);
-  void instrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
+  void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
                             ArrayRef<GlobalVariable *> ExtendedGlobals,
                             ArrayRef<Constant *> MetadataInitializers,
                             const std::string &UniqueModuleId);
@@ -2177,7 +2177,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
     appendToCompilerUsed(M, MetadataGlobals);
 }
 
-void ModuleAddressSanitizer::instrumentGlobalsELF(
+void ModuleAddressSanitizer::InstrumentGlobalsELF(
     IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
     ArrayRef<Constant *> MetadataInitializers,
     const std::string &UniqueModuleId) {
@@ -2187,7 +2187,7 @@ void ModuleAddressSanitizer::instrumentGlobalsELF(
   // false negative odr violations at link time. If odr indicators are used, we
   // keep the comdat sections, as link time odr violations will be dectected on
   // the odr indicator symbols.
-  bool UseComdatForGlobalsGC = UseOdrIndicator && !UniqueModuleId.empty();
+  bool UseComdatForGlobalsGC = UseOdrIndicator;
 
   SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
   for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -2237,7 +2237,7 @@ void ModuleAddressSanitizer::instrumentGlobalsELF(
 
   // We also need to unregister globals at the end, e.g., when a shared library
   // gets closed.
-  if (DestructorKind != AsanDtorKind::None && !MetadataGlobals.empty()) {
+  if (DestructorKind != AsanDtorKind::None) {
     IRBuilder<> IrbDtor(CreateAsanModuleDtor(M));
     IrbDtor.CreateCall(AsanUnregisterElfGlobals,
                        {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
@@ -2343,8 +2343,10 @@ void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
 // redzones and inserts this function into llvm.global_ctors.
 // Sets *CtorComdat to true if the global registration code emitted into the
 // asan constructor is comdat-compatible.
-void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB, Module &M,
+bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
                                                bool *CtorComdat) {
+  *CtorComdat = false;
+
   // Build set of globals that are aliased by some GA, where
   // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable.
   SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
@@ -2362,6 +2364,11 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB, Module &M,
   }
 
   size_t n = GlobalsToChange.size();
+  if (n == 0) {
+    *CtorComdat = true;
+    return false;
+  }
+
   auto &DL = M.getDataLayout();
 
   // A global is described by a structure
@@ -2384,11 +2391,8 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB, Module &M,
 
   // We shouldn't merge same module names, as this string serves as unique
   // module ID in runtime.
-  GlobalVariable *ModuleName =
-      n != 0
-          ? createPrivateGlobalForString(M, M.getModuleIdentifier(),
-                                         /*AllowMerging*/ false, kAsanGenPrefix)
-          : nullptr;
+  GlobalVariable *ModuleName = createPrivateGlobalForString(
+      M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
 
   for (size_t i = 0; i < n; i++) {
     GlobalVariable *G = GlobalsToChange[i];
@@ -2513,27 +2517,19 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB, Module &M,
   }
   appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
 
-  if (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) {
-    // Use COMDAT and register globals even if n == 0 to ensure that (a) the
-    // linkage unit will only have one module constructor, and (b) the register
-    // function will be called. The module destructor is not created when n ==
-    // 0.
+  std::string ELFUniqueModuleId =
+      (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
+                                                        : "";
+
+  if (!ELFUniqueModuleId.empty()) {
+    InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
     *CtorComdat = true;
-    instrumentGlobalsELF(IRB, M, NewGlobals, Initializers,
-                         getUniqueModuleId(&M));
-  } else if (n == 0) {
-    // When UseGlobalsGC is false, COMDAT can still be used if n == 0, because
-    // all compile units will have identical module constructor/destructor.
-    *CtorComdat = TargetTriple.isOSBinFormatELF();
+  } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
+    InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+  } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
+    InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
   } else {
-    *CtorComdat = false;
-    if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
-      InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
-    } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
-      InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
-    } else {
-      InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
-    }
+    InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
   }
 
   // Create calls for poisoning before initializers run and unpoisoning after.
@@ -2541,6 +2537,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB, Module &M,
     createInitializerPoisonCalls(M, ModuleName);
 
   LLVM_DEBUG(dbgs() << M);
+  return true;
 }
 
 uint64_t
@@ -2604,10 +2601,10 @@ bool ModuleAddressSanitizer::instrumentModule(Module &M) {
     assert(AsanCtorFunction || ConstructorKind == AsanCtorKind::None);
     if (AsanCtorFunction) {
       IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
-      instrumentGlobals(IRB, M, &CtorComdat);
+      InstrumentGlobals(IRB, M, &CtorComdat);
     } else {
       IRBuilder<> IRB(*C);
-      instrumentGlobals(IRB, M, &CtorComdat);
+      InstrumentGlobals(IRB, M, &CtorComdat);
     }
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
index d53e12ad1ff5c3..e2e5f21b376b03 100644
--- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -66,7 +66,7 @@ static bool runCGProfilePass(
     if (F.isDeclaration() || !F.getEntryCount())
       continue;
     auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
-    if (BFI.getEntryFreq() == 0)
+    if (BFI.getEntryFreq() == BlockFrequency(0))
       continue;
     TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
     for (auto &BB : F) {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index 2906fe19098401..fd0f69eca96e6c 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -378,7 +378,7 @@ bool MemOPSizeOpt::perform(MemOp MO) {
   assert(It != DefaultBB->end());
   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
   MergeBB->setName("MemOP.Merge");
-  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+  BFI.setBlockFreq(MergeBB, OrigBBFreq);
   DefaultBB->setName("MemOP.Default");
 
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index bca37da5ff10f2..e71e7eeb8ef1e1 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -859,18 +859,18 @@ void State::addInfoForInductions(BasicBlock &BB) {
     return;
 
   auto *AR = dyn_cast_or_null<SCEVAddRecExpr>(SE.getSCEV(PN));
-  if (!AR)
+  BasicBlock *LoopPred = L->getLoopPredecessor();
+  if (!AR || !LoopPred)
     return;
 
   const SCEV *StartSCEV = AR->getStart();
   Value *StartValue = nullptr;
-  if (auto *C = dyn_cast<SCEVConstant>(StartSCEV))
+  if (auto *C = dyn_cast<SCEVConstant>(StartSCEV)) {
     StartValue = C->getValue();
-  else if (auto *U = dyn_cast<SCEVUnknown>(StartSCEV))
-    StartValue = U->getValue();
-
-  if (!StartValue)
-    return;
+  } else {
+    StartValue = PN->getIncomingValueForBlock(LoopPred);
+    assert(SE.getSCEV(StartValue) == StartSCEV && "inconsistent start value");
+  }
 
   DomTreeNode *DTN = DT.getNode(InLoopSucc);
   auto Inc = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_UGT);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index bc54846ccf0ad2..4c5b0ff3af16e5 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1472,6 +1472,7 @@ void GVNPass::eliminatePartiallyRedundantLoad(
   // Perform PHI construction.
   Value *V = ConstructSSAForLoadSet(Load, ValuesPerBlock, *this);
   // ConstructSSAForLoadSet is responsible for combining metadata.
+  ICF->removeUsersOf(Load);
   Load->replaceAllUsesWith(V);
   if (isa<PHINode>(V))
     V->takeName(Load);
@@ -1891,6 +1892,7 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) {
     // Perform PHI construction.
     Value *V = ConstructSSAForLoadSet(Load, ValuesPerBlock, *this);
     // ConstructSSAForLoadSet is responsible for combining metadata.
+    ICF->removeUsersOf(Load);
     Load->replaceAllUsesWith(V);
 
     if (isa<PHINode>(V))
@@ -2165,6 +2167,7 @@ bool GVNPass::processLoad(LoadInst *L) {
   Value *AvailableValue = AV->MaterializeAdjustedValue(L, L, *this);
 
   // MaterializeAdjustedValue is responsible for combining metadata.
+  ICF->removeUsersOf(L);
   L->replaceAllUsesWith(AvailableValue);
   markInstructionForDeletion(L);
   if (MSSAU)
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index ed69f1938ec792..8c3ff399621a88 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -2251,7 +2251,7 @@ void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
     assert(BPI && "It's expected BPI to exist along with BFI");
     auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
                      BPI->getEdgeProbability(PredPredBB, PredBB);
-    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+    BFI->setBlockFreq(NewBB, NewBBFreq);
   }
 
   // We are going to have to map operands from the original BB block to the new
@@ -2377,7 +2377,7 @@ void JumpThreadingPass::threadEdge(BasicBlock *BB,
     assert(BPI && "It's expected BPI to exist along with BFI");
     auto NewBBFreq =
         BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
-    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+    BFI->setBlockFreq(NewBB, NewBBFreq);
   }
 
   // Copy all the instructions from BB to NewBB except the terminator.
@@ -2462,7 +2462,7 @@ BasicBlock *JumpThreadingPass::splitBlockPreds(BasicBlock *BB,
         NewBBFreq += FreqMap.lookup(Pred);
     }
     if (BFI) // Apply the summed frequency to NewBB.
-      BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+      BFI->setBlockFreq(NewBB, NewBBFreq);
   }
 
   DTU->applyUpdatesPermissive(Updates);
@@ -2502,7 +2502,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
   auto NewBBFreq = BFI->getBlockFreq(NewBB);
   auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
   auto BBNewFreq = BBOrigFreq - NewBBFreq;
-  BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+  BFI->setBlockFreq(BB, BBNewFreq);
 
   // Collect updated outgoing edges' frequencies from BB and use them to update
   // edge probabilities.
@@ -2760,7 +2760,7 @@ void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
     BranchProbability PredToNewBBProb = BranchProbability::getBranchProbability(
         TrueWeight, TrueWeight + FalseWeight);
     auto NewBBFreq = BFI->getBlockFreq(Pred) * PredToNewBBProb;
-    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+    BFI->setBlockFreq(NewBB, NewBBFreq);
   }
 
   // The select is now dead.
diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
index 4d51f0566310c1..9f99f6214748bc 100644
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -79,7 +79,7 @@ static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
 ///     AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
 static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
                                       BlockFrequencyInfo &BFI) {
-  BlockFrequency T = 0;
+  BlockFrequency T(0);
   for (BasicBlock *B : BBs)
     T += BFI.getBlockFreq(B);
   if (BBs.size() > 1)
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index cd3ef42f8f29c2..5fb796cc3db628 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -387,7 +387,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
     if (ShouldUpdateAnalysis) {
       // Copy the BFI/BPI from Target to BodyBlock.
       BPI->setEdgeProbability(BodyBlock, EdgeProbabilities);
-      BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency());
+      BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target));
     }
     // It's possible Target was its own successor through an indirectbr.
     // In this case, the indirectbr now comes from BodyBlock.
@@ -411,10 +411,10 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
             BPI->getEdgeProbability(Src, DirectSucc);
     }
     if (ShouldUpdateAnalysis) {
-      BFI->setBlockFreq(DirectSucc, BlockFreqForDirectSucc.getFrequency());
+      BFI->setBlockFreq(DirectSucc, BlockFreqForDirectSucc);
       BlockFrequency NewBlockFreqForTarget =
           BFI->getBlockFreq(Target) - BlockFreqForDirectSucc;
-      BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency());
+      BFI->setBlockFreq(Target, NewBlockFreqForTarget);
     }
 
     // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index cafa99491f5b5f..08b2b01b2ee1e2 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1579,6 +1579,12 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
       DebugIntrinsicsToDelete.push_back(DVI);
       continue;
     }
+    // DbgAssign intrinsics have an extra Value argument:
+    if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
+        DAI && IsInvalidLocation(DAI->getAddress())) {
+      DebugIntrinsicsToDelete.push_back(DVI);
+      continue;
+    }
     // If the variable was in the scope of the old function, i.e. it was not
     // inlined, point the intrinsic to a fresh variable within the new function.
     if (!DVI->getDebugLoc().getInlinedAt()) {
@@ -1773,11 +1779,11 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
 
   // Update the entry count of the function.
   if (BFI) {
-    auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+    auto Count = BFI->getProfileCountFromFreq(EntryFreq);
     if (Count)
       newFunction->setEntryCount(
           ProfileCount(*Count, Function::PCT_Real)); // FIXME
-    BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
+    BFI->setBlockFreq(codeReplacer, EntryFreq);
   }
 
   CallInst *TheCall =
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index dab0be3a9fdeb9..0990c750af55f1 100644
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -91,18 +91,16 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
                                                    Mappings.end());
 
   auto AddVariantDecl = [&](const ElementCount &VF, bool Predicate) {
-    const std::string TLIName =
-        std::string(TLI.getVectorizedFunction(ScalarName, VF, Predicate));
-    if (!TLIName.empty()) {
-      std::string MangledName = VFABI::mangleTLIVectorName(
-          TLIName, ScalarName, CI.arg_size(), VF, Predicate);
+    const VecDesc *VD = TLI.getVectorMappingInfo(ScalarName, VF, Predicate);
+    if (VD && !VD->getVectorFnName().empty()) {
+      std::string MangledName = VD->getVectorFunctionABIVariantString();
       if (!OriginalSetOfMappings.count(MangledName)) {
         Mappings.push_back(MangledName);
         ++NumCallInjected;
       }
-      Function *VariantF = M->getFunction(TLIName);
+      Function *VariantF = M->getFunction(VD->getVectorFnName());
       if (!VariantF)
-        addVariantDeclaration(CI, VF, Predicate, TLIName);
+        addVariantDeclaration(CI, VF, Predicate, VD->getVectorFnName());
     }
   };
 
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 548f949277952c..7ea51aab7dc1fc 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1358,6 +1358,8 @@ static AttrBuilder IdentifyValidUBGeneratingAttributes(CallBase &CB) {
     Valid.addDereferenceableOrNullAttr(DerefOrNullBytes);
   if (CB.hasRetAttr(Attribute::NoAlias))
     Valid.addAttribute(Attribute::NoAlias);
+  if (CB.hasRetAttr(Attribute::NoUndef))
+    Valid.addAttribute(Attribute::NoUndef);
   return Valid;
 }
 
@@ -1367,6 +1369,8 @@ static AttrBuilder IdentifyValidPoisonGeneratingAttributes(CallBase &CB) {
   AttrBuilder Valid(CB.getContext());
   if (CB.hasRetAttr(Attribute::NonNull))
     Valid.addAttribute(Attribute::NonNull);
+  if (CB.hasRetAttr(Attribute::Alignment))
+    Valid.addAlignmentAttr(CB.getRetAlign());
   return Valid;
 }
 
@@ -1417,6 +1421,11 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
     // existing attribute value (i.e. attributes such as dereferenceable,
     // dereferenceable_or_null etc). See AttrBuilder::merge for more details.
     AttributeList AL = NewRetVal->getAttributes();
+    if (ValidUB.getDereferenceableBytes() < AL.getRetDereferenceableBytes())
+      ValidUB.removeAttribute(Attribute::Dereferenceable);
+    if (ValidUB.getDereferenceableOrNullBytes() <
+        AL.getRetDereferenceableOrNullBytes())
+      ValidUB.removeAttribute(Attribute::DereferenceableOrNull);
     AttributeList NewAL = AL.addRetAttributes(Context, ValidUB);
     // Attributes that may generate poison returns are a bit tricky. If we
     // propagate them, other uses of the callsite might have their behavior
@@ -1450,6 +1459,8 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
     // any new poison at @use will trigger UB anyways.
     // In case 3, we can never propagate nonnull because it may create UB due to
     // the noundef on @bar.
+    if (ValidPG.getAlignment().valueOrOne() < AL.getRetAlignment().valueOrOne())
+      ValidPG.removeAttribute(Attribute::Alignment);
     if (ValidPG.hasAttributes()) {
       // Three checks.
       // If the callsite has `noundef`, then a poison due to violating the
@@ -1819,12 +1830,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
       continue;
     auto *OrigBB = cast<BasicBlock>(Entry.first);
     auto *ClonedBB = cast<BasicBlock>(Entry.second);
-    uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+    BlockFrequency Freq = CalleeBFI->getBlockFreq(OrigBB);
     if (!ClonedBBs.insert(ClonedBB).second) {
       // Multiple blocks in the callee might get mapped to one cloned block in
       // the caller since we prune the callee as we clone it. When that happens,
       // we want to use the maximum among the original blocks' frequencies.
-      uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+      BlockFrequency NewFreq = CallerBFI->getBlockFreq(ClonedBB);
       if (NewFreq > Freq)
         Freq = NewFreq;
     }
@@ -1832,8 +1843,7 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
   }
   BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
   CallerBFI->setBlockFreqAndScale(
-      EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
-      ClonedBBs);
+      EntryClone, CallerBFI->getBlockFreq(CallSiteBlock), ClonedBBs);
 }
 
 /// Update the branch metadata for cloned call instructions.
@@ -2800,8 +2810,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
   if (IFI.CallerBFI) {
     // Copy original BB's block frequency to AfterCallBB
-    IFI.CallerBFI->setBlockFreq(
-        AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
+    IFI.CallerBFI->setBlockFreq(AfterCallBB,
+                                IFI.CallerBFI->getBlockFreq(OrigBB));
   }
 
   // Change the branch that used to go to AfterCallBB to branch to the first
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 8a67fda7c98e78..4b96b02ee2ecd6 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -1225,10 +1225,12 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
 
   if (Constant *OpC = getConstant(OpSt, I.getOperand(0)->getType())) {
     // Fold the constant as we build.
-    Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
-    markConstant(&I, C);
-  } else if (I.getDestTy()->isIntegerTy() &&
-             I.getSrcTy()->isIntOrIntVectorTy()) {
+    if (Constant *C =
+            ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL))
+      return (void)markConstant(&I, C);
+  }
+
+  if (I.getDestTy()->isIntegerTy() && I.getSrcTy()->isIntOrIntVectorTy()) {
     auto &LV = getValueState(&I);
     ConstantRange OpRange = getConstantRange(OpSt, I.getSrcTy());
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cc17d91d4f4372..53ad37bf3599b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3818,34 +3818,16 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
   // instead of the former. For an inloop reduction the reduction will already
   // be predicated, and does not need to be handled here.
   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
-      SelectInst *Sel = nullptr;
-      for (User *U : VecLoopExitInst->users()) {
-        if (isa<SelectInst>(U)) {
-          assert((!Sel || U == Sel) &&
-                 "Reduction exit feeding two different selects");
-          Sel = cast<SelectInst>(U);
-        } else
-          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
-      }
-      assert(Sel && "Reduction exit feeds no select");
-      State.reset(LoopExitInstDef, Sel, Part);
-
-      // If the target can create a predicated operator for the reduction at no
-      // extra cost in the loop (for example a predicated vadd), it can be
-      // cheaper for the select to remain in the loop than be sunk out of it,
-      // and so use the select value for the phi instead of the old
-      // LoopExitValue.
-      if (PreferPredicatedReductionSelect ||
-          TTI->preferPredicatedReductionSelect(
-              RdxDesc.getOpcode(), PhiTy,
-              TargetTransformInfo::ReductionFlags())) {
-        auto *VecRdxPhi =
-            cast<PHINode>(State.get(PhiR, Part));
-        VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
+    VPValue *Def = nullptr;
+    for (VPUser *U : LoopExitInstDef->users()) {
+      auto *S = dyn_cast<VPInstruction>(U);
+      if (S && S->getOpcode() == Instruction::Select) {
+        Def = S;
+        break;
       }
     }
+    if (Def)
+      LoopExitInstDef = Def;
   }
 
   // If the vector reduction can be performed in a smaller type, we truncate
@@ -5142,7 +5124,9 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                          "exceeding the constant trip count: "
                       << ClampedConstTripCount << "\n");
-    return ElementCount::getFixed(ClampedConstTripCount);
+    return ElementCount::get(
+        ClampedConstTripCount,
+        FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
   }
 
   TargetTransformInfo::RegisterKind RegKind =
@@ -9099,6 +9083,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
               ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
               : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
       Select->insertBefore(&*Builder.getInsertPoint());
+      if (PreferPredicatedReductionSelect ||
+          TTI.preferPredicatedReductionSelect(
+              PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
+              TargetTransformInfo::ReductionFlags()))
+        PhiR->setOperand(1, Select);
     }
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2884de6ac1d2dd..a010ffb3c9d789 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3812,7 +3812,7 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
     inversePermutation(Order, MaskOrder);
   }
   reorderReuses(MaskOrder, Mask);
-  if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
+  if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
     Order.clear();
     return;
   }
@@ -4333,7 +4333,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                                unsigned Sz) {
   ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
-  if (ShuffleVectorInst::isIdentityMask(FirstCluster))
+  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
     return false;
   for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
     ArrayRef<int> Cluster = Mask.slice(I, Sz);
@@ -6646,9 +6646,24 @@ class BaseShuffleAnalysis {
                              bool IsStrict) {
     int Limit = Mask.size();
     int VF = VecTy->getNumElements();
-    return (VF == Limit || !IsStrict) &&
-           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-           ShuffleVectorInst::isIdentityMask(Mask);
+    int Index = -1;
+    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
+      return true;
+    if (!IsStrict) {
+      // Consider extract subvector starting from index 0.
+      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+          Index == 0)
+        return true;
+      // All VF-size submasks are identity (e.g.
+      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
+      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
+            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
+            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
+                   ShuffleVectorInst::isIdentityMask(Slice, VF);
+          }))
+        return true;
+    }
+    return false;
   }
 
   /// Tries to combine 2 different masks into single one.
@@ -6718,7 +6733,8 @@ class BaseShuffleAnalysis {
       if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
         if (!IdentityOp || !SinglePermute ||
             (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
-             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
+                                                    IdentityMask.size()))) {
           IdentityOp = SV;
           // Store current mask in the IdentityMask so later we did not lost
           // this info if IdentityOp is selected as the best candidate for the
@@ -6788,7 +6804,7 @@ class BaseShuffleAnalysis {
     }
     if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
         !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
-        ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
+        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
       if (IdentityOp) {
         V = IdentityOp;
         assert(Mask.size() == IdentityMask.size() &&
@@ -6804,7 +6820,7 @@ class BaseShuffleAnalysis {
                                /*IsStrict=*/true) ||
                 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                  Shuffle->isZeroEltSplat() &&
-                 ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
       }
       V = Op;
       return false;
@@ -6909,11 +6925,9 @@ class BaseShuffleAnalysis {
           CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
         }
       }
-      const int Limit = CombinedMask1.size() * 2;
-      if (Op1 == Op2 && Limit == 2 * VF &&
-          all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
-          (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
-           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+      if (Op1 == Op2 &&
+          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
+           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
             isa<ShuffleVectorInst>(Op1) &&
             cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                 ArrayRef(CombinedMask1))))
@@ -6953,6 +6967,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   SmallPtrSetImpl<Value *> &CheckedExtracts;
   constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
+  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
+    if (Ty->getScalarType()->isPointerTy()) {
+      Constant *Res = ConstantExpr::getIntToPtr(
+          ConstantInt::getAllOnesValue(
+              IntegerType::get(Ty->getContext(),
+                               DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
+          Ty->getScalarType());
+      if (auto *VTy = dyn_cast<VectorType>(Ty))
+        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
+      return Res;
+    }
+    return Constant::getAllOnesValue(Ty);
+  }
+
   InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
     if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
       return TTI::TCC_Free;
@@ -7093,7 +7121,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       NumElts = std::max(NumElts, VecTy->getNumElements());
     }
     assert(NumElts > 0 &&
-           "Expected at least 2-element fixed length vector(s).");
+           "Expected at least 1-element fixed length vector(s).");
     auto *VecTy = FixedVectorType::get(VL.front()->getType(), NumElts);
     unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
     if (!NumOfParts || NumElts < NumOfParts)
@@ -7183,11 +7211,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     const TargetTransformInfo &TTI;
 
     static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
-      int Limit = 2 * VF;
+      int Index = -1;
       return Mask.empty() ||
              (VF == Mask.size() &&
-              all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-              ShuffleVectorInst::isIdentityMask(Mask));
+              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
+             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+              Index == 0);
     }
 
   public:
@@ -7200,21 +7229,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
           cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
       if (isEmptyOrIdentity(Mask, VF))
         return TTI::TCC_Free;
-      return TTI.getShuffleCost(
-          TTI::SK_PermuteTwoSrc,
-          FixedVectorType::get(
-              cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
-          Mask);
+      return TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
+                                cast<VectorType>(V1->getType()), Mask);
     }
     InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
       // Empty mask or identity mask are free.
-      if (isEmptyOrIdentity(Mask, Mask.size()))
+      unsigned VF =
+          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+      if (isEmptyOrIdentity(Mask, VF))
         return TTI::TCC_Free;
-      return TTI.getShuffleCost(
-          TTI::SK_PermuteSingleSrc,
-          FixedVectorType::get(
-              cast<VectorType>(V1->getType())->getElementType(), Mask.size()),
-          Mask);
+      return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
+                                cast<VectorType>(V1->getType()), Mask);
     }
     InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
     InstructionCost createPoison(Type *Ty, unsigned VF) const {
@@ -7231,47 +7256,119 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                 const PointerUnion<Value *, const TreeEntry *> &P2,
                 ArrayRef<int> Mask) {
     ShuffleCostBuilder Builder(TTI);
+    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
     Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
-    unsigned CommonVF = 0;
-    if (!V1) {
+    unsigned CommonVF = Mask.size();
+    if (!V1 && !V2 && !P2.isNull()) {
+      // Shuffle 2 entry nodes.
       const TreeEntry *E = P1.get<const TreeEntry *>();
       unsigned VF = E->getVectorFactor();
-      if (V2) {
-        unsigned V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
-        if (V2VF != VF && V2VF == E->Scalars.size())
-          VF = E->Scalars.size();
-      } else if (!P2.isNull()) {
-        const TreeEntry *E2 = P2.get<const TreeEntry *>();
-        if (E->Scalars.size() == E2->Scalars.size())
-          CommonVF = VF = E->Scalars.size();
-      } else {
-        // P2 is empty, check that we have same node + reshuffle (if any).
-        if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
-          VF = E->Scalars.size();
-          SmallVector<int> CommonMask(Mask.begin(), Mask.end());
-          ::addMask(CommonMask, E->getCommonMask());
-          V1 = Constant::getNullValue(
-              FixedVectorType::get(E->Scalars.front()->getType(), VF));
-          return BaseShuffleAnalysis::createShuffle<InstructionCost>(
-              V1, nullptr, CommonMask, Builder);
+      const TreeEntry *E2 = P2.get<const TreeEntry *>();
+      CommonVF = std::max(VF, E2->getVectorFactor());
+      assert(all_of(Mask,
+                    [=](int Idx) {
+                      return Idx < 2 * static_cast<int>(CommonVF);
+                    }) &&
+             "All elements in mask must be less than 2 * CommonVF.");
+      if (E->Scalars.size() == E2->Scalars.size()) {
+        SmallVector<int> EMask = E->getCommonMask();
+        SmallVector<int> E2Mask = E2->getCommonMask();
+        if (!EMask.empty() || !E2Mask.empty()) {
+          for (int &Idx : CommonMask) {
+            if (Idx == PoisonMaskElem)
+              continue;
+            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
+              Idx = EMask[Idx];
+            else if (Idx >= static_cast<int>(CommonVF))
+              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
+                    E->Scalars.size();
+          }
         }
+        CommonVF = E->Scalars.size();
       }
       V1 = Constant::getNullValue(
-          FixedVectorType::get(E->Scalars.front()->getType(), VF));
-    }
-    if (!V2 && !P2.isNull()) {
-      const TreeEntry *E = P2.get<const TreeEntry *>();
+          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+      V2 = getAllOnesValue(
+          *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+    } else if (!V1 && P2.isNull()) {
+      // Shuffle single entry node.
+      const TreeEntry *E = P1.get<const TreeEntry *>();
       unsigned VF = E->getVectorFactor();
-      unsigned V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
-      if (!CommonVF && V1VF == E->Scalars.size())
+      CommonVF = VF;
+      assert(
+          all_of(Mask,
+                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
+          "All elements in mask must be less than CommonVF.");
+      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
+        SmallVector<int> EMask = E->getCommonMask();
+        assert(!EMask.empty() && "Expected non-empty common mask.");
+        for (int &Idx : CommonMask) {
+          if (Idx != PoisonMaskElem)
+            Idx = EMask[Idx];
+        }
         CommonVF = E->Scalars.size();
-      if (CommonVF)
-        VF = CommonVF;
-      V2 = Constant::getNullValue(
-          FixedVectorType::get(E->Scalars.front()->getType(), VF));
-    }
-    return BaseShuffleAnalysis::createShuffle<InstructionCost>(V1, V2, Mask,
-                                                               Builder);
+      }
+      V1 = Constant::getNullValue(
+          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+    } else if (V1 && P2.isNull()) {
+      // Shuffle single vector.
+      CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
+      assert(
+          all_of(Mask,
+                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
+          "All elements in mask must be less than CommonVF.");
+    } else if (V1 && !V2) {
+      // Shuffle vector and tree node.
+      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+      const TreeEntry *E2 = P2.get<const TreeEntry *>();
+      CommonVF = std::max(VF, E2->getVectorFactor());
+      assert(all_of(Mask,
+                    [=](int Idx) {
+                      return Idx < 2 * static_cast<int>(CommonVF);
+                    }) &&
+             "All elements in mask must be less than 2 * CommonVF.");
+      if (E2->Scalars.size() == VF && VF != CommonVF) {
+        SmallVector<int> E2Mask = E2->getCommonMask();
+        assert(!E2Mask.empty() && "Expected non-empty common mask.");
+        for (int &Idx : CommonMask) {
+          if (Idx == PoisonMaskElem)
+            continue;
+          if (Idx >= static_cast<int>(CommonVF))
+            Idx = E2Mask[Idx - CommonVF] + VF;
+        }
+        CommonVF = VF;
+      }
+      V1 = Constant::getNullValue(
+          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+      V2 = getAllOnesValue(
+          *R.DL,
+          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+    } else {
+      assert(V1 && V2 && "Expected both vectors.");
+      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+      CommonVF =
+          std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
+      assert(all_of(Mask,
+                    [=](int Idx) {
+                      return Idx < 2 * static_cast<int>(CommonVF);
+                    }) &&
+             "All elements in mask must be less than 2 * CommonVF.");
+      if (V1->getType() != V2->getType()) {
+        V1 = Constant::getNullValue(FixedVectorType::get(
+            cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
+        V2 = getAllOnesValue(
+            *R.DL, FixedVectorType::get(
+                       cast<FixedVectorType>(V1->getType())->getElementType(),
+                       CommonVF));
+      }
+    }
+    InVectors.front() = Constant::getNullValue(FixedVectorType::get(
+        cast<FixedVectorType>(V1->getType())->getElementType(),
+        CommonMask.size()));
+    if (InVectors.size() == 2)
+      InVectors.pop_back();
+    return BaseShuffleAnalysis::createShuffle<InstructionCost>(
+        V1, V2, CommonMask, Builder);
   }
 
 public:
@@ -7415,7 +7512,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     return ConstantVector::getSplat(
         ElementCount::getFixed(VL.size()),
-        Constant::getNullValue(VL.front()->getType()));
+        getAllOnesValue(*R.DL, VL.front()->getType()));
   }
   /// Finalize emission of the shuffles.
   InstructionCost
@@ -7424,31 +7521,22 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     IsFinalized = true;
     if (Action) {
       const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
-      if (InVectors.size() == 2) {
+      if (InVectors.size() == 2)
         Cost += createShuffle(Vec, InVectors.back(), CommonMask);
-        InVectors.pop_back();
-      } else {
+      else
         Cost += createShuffle(Vec, nullptr, CommonMask);
-      }
       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
         if (CommonMask[Idx] != PoisonMaskElem)
           CommonMask[Idx] = Idx;
       assert(VF > 0 &&
              "Expected vector length for the final value before action.");
-      Value *V = Vec.dyn_cast<Value *>();
-      if (!Vec.isNull() && !V)
-        V = Constant::getNullValue(FixedVectorType::get(
-            Vec.get<const TreeEntry *>()->Scalars.front()->getType(),
-            CommonMask.size()));
+      Value *V = Vec.get<Value *>();
       Action(V, CommonMask);
+      InVectors.front() = V;
     }
     ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
     if (CommonMask.empty())
       return Cost;
-    int Limit = CommonMask.size() * 2;
-    if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
-        ShuffleVectorInst::isIdentityMask(CommonMask))
-      return Cost;
     return Cost +
            createShuffle(InVectors.front(),
                          InVectors.size() == 2 ? InVectors.back() : nullptr,
@@ -7624,7 +7712,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   if (NeedToShuffleReuses)
     ::addMask(Mask, E->ReuseShuffleIndices);
-  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
     CommonCost =
         TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
@@ -7975,7 +8063,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         } else {
           SmallVector<int> Mask;
           inversePermutation(OpTE->ReorderIndices, Mask);
-          if (ShuffleVectorInst::isReverseMask(Mask))
+          if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
             CCH = TTI::CastContextHint::Reversed;
         }
       }
@@ -8834,9 +8922,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     unsigned VecVF = TE->getVectorFactor();
     if (VF != VecVF &&
         (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
-         (all_of(Mask,
-                 [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
-          !ShuffleVectorInst::isIdentityMask(Mask)))) {
+         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
       SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
       std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                 OrigMask.begin());
@@ -8855,19 +8941,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   // Calculate the cost of the reshuffled vectors, if any.
   for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
     Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
-    unsigned VF = ShuffleMasks[I].begin()->second.size();
-    auto *FTy = FixedVectorType::get(
-        cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
     auto Vector = ShuffleMasks[I].takeVector();
-    auto &&EstimateShufflesCost = [this, FTy,
-                                   &Cost](ArrayRef<int> Mask,
-                                          ArrayRef<const TreeEntry *> TEs) {
+    unsigned VF = 0;
+    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
+                                    ArrayRef<const TreeEntry *> TEs) {
       assert((TEs.size() == 1 || TEs.size() == 2) &&
              "Expected exactly 1 or 2 tree entries.");
       if (TEs.size() == 1) {
-        int Limit = 2 * Mask.size();
-        if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
-            !ShuffleVectorInst::isIdentityMask(Mask)) {
+        if (VF == 0)
+          VF = TEs.front()->getVectorFactor();
+        auto *FTy =
+            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
+        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
+            !all_of(enumerate(Mask), [=](const auto &Data) {
+              return Data.value() == PoisonMaskElem ||
+                     (Data.index() < VF &&
+                      static_cast<int>(Data.index()) == Data.value());
+            })) {
           InstructionCost C =
               TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
           LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -8878,6 +8968,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           Cost += C;
         }
       } else {
+        if (VF == 0) {
+          if (TEs.front() &&
+              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
+            VF = TEs.front()->getVectorFactor();
+          else
+            VF = Mask.size();
+        }
+        auto *FTy =
+            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
         InstructionCost C =
             TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
         LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -8887,6 +8986,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                    dbgs() << "SLP: Current total cost = " << Cost << "\n");
         Cost += C;
       }
+      VF = Mask.size();
       return TEs.back();
     };
     (void)performExtractsShuffleAction<const TreeEntry>(
@@ -9643,7 +9743,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
         return V1;
       unsigned VF = Mask.size();
       unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
-      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask))
+      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
         return V1;
       Value *Vec = Builder.CreateShuffleVector(V1, Mask);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
@@ -10056,9 +10156,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
       return false;
     unsigned I =
         *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
-    int Sz = Mask.size();
-    if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
-        ShuffleVectorInst::isIdentityMask(Mask))
+    if (ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
       std::iota(Mask.begin(), Mask.end(), 0);
     else
       std::fill(Mask.begin(), Mask.end(), I);
@@ -10302,11 +10400,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
              TTI::SK_PermuteSingleSrc &&
          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
-         ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
+         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
         (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
              TTI::SK_PermuteSingleSrc &&
          none_of(Mask, [&](int I) { return I >= MSz; }) &&
-         ShuffleVectorInst::isIdentityMask(Mask));
+         ShuffleVectorInst::isIdentityMask(Mask, MSz));
     bool EnoughConstsForShuffle =
         IsSingleShuffle &&
         (none_of(GatheredScalars,
@@ -10599,7 +10697,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
           NumElts != NumScalars) {
         if (IsFirstUndef.all()) {
-          if (!ShuffleVectorInst::isIdentityMask(InsertMask)) {
+          if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
             SmallBitVector IsFirstPoison =
                 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
             if (!IsFirstPoison.all()) {
@@ -11422,7 +11520,7 @@ Value *BoUpSLP::vectorizeTree(
             // non-resizing mask.
             if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                    ->getNumElements() ||
-                !ShuffleVectorInst::isIdentityMask(Mask))
+                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
               return CreateShuffle(Vals.front(), nullptr, Mask);
             return Vals.front();
           }
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9b9aaa22a71c6b..286120ed534b77 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -696,20 +696,20 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
     return false;
 
   auto *DestTy = cast<FixedVectorType>(I.getType());
-  unsigned DestNumElts = DestTy->getNumElements();
-  unsigned SrcNumElts = SrcTy->getNumElements();
+  unsigned DestEltSize = DestTy->getScalarSizeInBits();
+  unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
   SmallVector<int, 16> NewMask;
-  if (SrcNumElts <= DestNumElts) {
+  if (DestEltSize <= SrcEltSize) {
     // The bitcast is from wide to narrow/equal elements. The shuffle mask can
     // always be expanded to the equivalent form choosing narrower elements.
-    assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
-    unsigned ScaleFactor = DestNumElts / SrcNumElts;
+    assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
+    unsigned ScaleFactor = SrcEltSize / DestEltSize;
     narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
   } else {
     // The bitcast is from narrow elements to wide elements. The shuffle mask
     // must choose consecutive elements to allow casting first.
-    assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
-    unsigned ScaleFactor = SrcNumElts / DestNumElts;
+    assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
+    unsigned ScaleFactor = DestEltSize / SrcEltSize;
     if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
       return false;
   }
diff --git a/llvm/projects/CMakeLists.txt b/llvm/projects/CMakeLists.txt
index 2f66cd1a4c44b4..08f2fa522420b0 100644
--- a/llvm/projects/CMakeLists.txt
+++ b/llvm/projects/CMakeLists.txt
@@ -6,7 +6,6 @@ foreach(entry ${entries})
   if(IS_DIRECTORY ${entry} AND EXISTS ${entry}/CMakeLists.txt)
     if((NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/dragonegg) AND
-       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/flang-rt) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libcxx) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libcxxabi) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libunwind) AND
@@ -38,8 +37,6 @@ if(${LLVM_BUILD_RUNTIME})
   if(NOT LLVM_BUILD_EXTERNAL_COMPILER_RT)
     add_llvm_external_project(compiler-rt)
   endif()
-
-  add_llvm_external_project(flang-rt)
 endif()
 
 add_llvm_external_project(dragonegg)
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index dc568373e48f0e..7ec6480773c8f0 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -401,24 +401,17 @@ if(runtimes)
       endforeach()
     endif()
   endif()
-  set(EXTRA_CMAKE_ARGS "")
-  if("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
-    list(APPEND EXTRA_CMAKE_ARGS "-DLLVM_DIR=${LLVM_BINARY_DIR}/lib/cmake/llvm")
-    list(APPEND EXTRA_CMAKE_ARGS "-DFLANG_DIR=${LLVM_BINARY_DIR}/lib/cmake/flang")
-    list(APPEND EXTRA_CMAKE_ARGS "-DCLANG_DIR=${LLVM_BINARY_DIR}/lib/cmake/clang")
-    list(APPEND EXTRA_CMAKE_ARGS "-DMLIR_DIR=${LLVM_BINARY_DIR}/lib/cmake/mlir")
-  endif()
   if(NOT LLVM_RUNTIME_TARGETS)
     runtime_default_target(
       DEPENDS ${builtins_dep} ${extra_deps}
-      CMAKE_ARGS ${libc_cmake_args} ${EXTRA_CMAKE_ARGS}
+      CMAKE_ARGS ${libc_cmake_args}
       PREFIXES ${prefixes})
     set(test_targets check-runtimes)
   else()
     if("default" IN_LIST LLVM_RUNTIME_TARGETS)
       runtime_default_target(
         DEPENDS ${builtins_dep} ${extra_deps}
-        CMAKE_ARGS ${libc_cmake_args} ${EXTRA_CMAKE_ARGS}
+        CMAKE_ARGS ${libc_cmake_args}
         PREFIXES ${prefixes})
       list(REMOVE_ITEM LLVM_RUNTIME_TARGETS "default")
     else()
@@ -458,7 +451,7 @@ if(runtimes)
 
       runtime_register_target(${name}
         DEPENDS ${builtins_dep_name} ${libc_tools}
-        CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name} ${libc_cmake_args} ${EXTRA_CMAKE_ARGS}
+        CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name} ${libc_cmake_args}
         EXTRA_ARGS TARGET_TRIPLE ${name})
 
       add_dependencies(runtimes runtimes-${name})
@@ -488,7 +481,6 @@ if(runtimes)
           CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name}
                      -DLLVM_RUNTIMES_PREFIX=${name}/
                      -DLLVM_RUNTIMES_LIBDIR_SUBDIR=${multilib}
-                     ${EXTRA_CMAKE_ARGS}
           BASE_NAME ${name}
           EXTRA_ARGS TARGET_TRIPLE ${name})
 
diff --git a/llvm/test/Analysis/BasicAA/range.ll b/llvm/test/Analysis/BasicAA/range.ll
index 3862e26ee49a4b..e5dfb60c8b8784 100644
--- a/llvm/test/Analysis/BasicAA/range.ll
+++ b/llvm/test/Analysis/BasicAA/range.ll
@@ -2,6 +2,7 @@
 
 %struct.S = type { i32, [2 x i32], i32 }
 %struct.S2 = type { i32, [4 x i32], [4 x i32] }
+@G = global [10 x i32] zeroinitializer, align 4
 
 ; CHECK: Function: t1
 ; CHECK: NoAlias: i32* %gep1, i32* %gep2
@@ -258,8 +259,19 @@ join:
   ret void
 }
 
-declare void @llvm.assume(i1)
 
+; CHECK-LABEL: Function: select_in_gep
+; CHECK: NoAlias: i32* %arrayidx, i32* getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 3)
+define i32 @select_in_gep(i1 %c)  {
+entry:
+  %select_ = select i1 %c, i64 2, i64 1
+  %arrayidx = getelementptr inbounds [10 x i32], ptr @G, i64 0, i64 %select_
+  store i32 42, ptr %arrayidx, align 4
+  %load_ = load i32, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 3), align 4
+  ret i32 %load_
+}
+
+declare void @llvm.assume(i1)
 
 !0 = !{ i32 0, i32 2 }
 !1 = !{ i32 0, i32 1 }
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index ee41c69baf2c8b..652d36c01a77e7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -117,8 +117,14 @@ define void @scalable_ext_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %zext.nxv16i8to64 = zext <vscale x 16 x i8> %load.nxv16i8.3 to <vscale x 16 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv8i8to16 = zext <vscale x 8 x i8> %load.nxv8i8 to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv4i8to32 = zext <vscale x 4 x i8> %load.nxv4i8 to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv2i8to64 = zext <vscale x 2 x i8> %load.nxv2i8 to <vscale x 2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
@@ -127,6 +133,8 @@ define void @scalable_ext_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %zext.nxv8i16to64 = zext <vscale x 8 x i16> %load.nxv8i16.2 to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv4i16to32 = zext <vscale x 4 x i16> %load.nxv4i16 to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv2i16to64 = zext <vscale x 2 x i16> %load.nxv2i16 to <vscale x 2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
@@ -141,8 +149,14 @@ define void @scalable_ext_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %sext.nxv16i8to64 = sext <vscale x 16 x i8> %load2.nxv16i8.3 to <vscale x 16 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv8i8to16 = sext <vscale x 8 x i8> %load2.nxv8i8 to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv4i8to32 = sext <vscale x 4 x i8> %load2.nxv4i8 to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv2i8to64 = sext <vscale x 2 x i8> %load2.nxv2i8 to <vscale x 2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
@@ -151,6 +165,8 @@ define void @scalable_ext_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext.nxv8i16to64 = sext <vscale x 8 x i16> %load2.nxv8i16.2 to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv4i16to32 = sext <vscale x 4 x i16> %load2.nxv4i16 to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv2i16to64 = sext <vscale x 2 x i16> %load2.nxv2i16 to <vscale x 2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
@@ -169,8 +185,14 @@ define void @scalable_ext_loads() {
   %zext.nxv16i8to64 = zext <vscale x 16 x i8> %load.nxv16i8.3 to <vscale x 16 x i64>
   %load.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
   %zext.nxv8i8to16 = zext <vscale x 8 x i8> %load.nxv8i8 to <vscale x 8 x i16>
+  %load.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+  %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32>
+  %load.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+  %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64>
   %load.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
   %zext.nxv4i8to32 = zext <vscale x 4 x i8> %load.nxv4i8 to <vscale x 4 x i32>
+  %load.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+  %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64>
   %load.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
   %zext.nxv2i8to64 = zext <vscale x 2 x i8> %load.nxv2i8 to <vscale x 2 x i64>
   %load.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
@@ -179,6 +201,8 @@ define void @scalable_ext_loads() {
   %zext.nxv8i16to64 = zext <vscale x 8 x i16> %load.nxv8i16.2 to <vscale x 8 x i64>
   %load.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
   %zext.nxv4i16to32 = zext <vscale x 4 x i16> %load.nxv4i16 to <vscale x 4 x i32>
+  %load.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+  %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64>
   %load.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
   %zext.nxv2i16to64 = zext <vscale x 2 x i16> %load.nxv2i16 to <vscale x 2 x i64>
   %load.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
@@ -194,8 +218,14 @@ define void @scalable_ext_loads() {
   %sext.nxv16i8to64 = sext <vscale x 16 x i8> %load2.nxv16i8.3 to <vscale x 16 x i64>
   %load2.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
   %sext.nxv8i8to16 = sext <vscale x 8 x i8> %load2.nxv8i8 to <vscale x 8 x i16>
+  %load2.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+  %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32>
+  %load2.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+  %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64>
   %load2.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
   %sext.nxv4i8to32 = sext <vscale x 4 x i8> %load2.nxv4i8 to <vscale x 4 x i32>
+  %load2.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+  %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64>
   %load2.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
   %sext.nxv2i8to64 = sext <vscale x 2 x i8> %load2.nxv2i8 to <vscale x 2 x i64>
   %load2.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
@@ -204,6 +234,8 @@ define void @scalable_ext_loads() {
   %sext.nxv8i16to64 = sext <vscale x 8 x i16> %load2.nxv8i16.2 to <vscale x 8 x i64>
   %load2.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
   %sext.nxv4i16to32 = sext <vscale x 4 x i16> %load2.nxv4i16 to <vscale x 4 x i32>
+  %load2.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
+  %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64>
   %load2.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
   %sext.nxv2i16to64 = sext <vscale x 2 x i16> %load2.nxv2i16 to <vscale x 2 x i64>
   %load2.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll
index 1979be7d50d2f4..cbb05620a92706 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll
@@ -11,12 +11,20 @@ define void @sve_ext() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zext_nxv8_i16_to_i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %zext_nxv8_i16_to_i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zext_nxv4_i32_to_i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %zext_nxv4_i8_to_i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %zext_nxv8_i8_to_i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %zext_nxv4_i16_to_i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %zext_nxv8_i8_to_i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_nxv16_i8_to_i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_nxv16_i8_to_i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %sext_nxv16_i8_to_i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_nxv8_i16_to_i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_nxv8_i16_to_i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_nxv4_i32_to_i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_nxv4_i8_to_i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_nxv8_i8_to_i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_nxv4_i16_to_i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_nxv8_i8_to_i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %zext_nxv16_i8_to_i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
@@ -25,6 +33,10 @@ define void @sve_ext() {
   %zext_nxv8_i16_to_i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
   %zext_nxv8_i16_to_i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
   %zext_nxv4_i32_to_i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+  %zext_nxv4_i8_to_i64  = zext <vscale x 4 x i8>  undef to <vscale x 4 x i64>
+  %zext_nxv8_i8_to_i32  = zext <vscale x 8 x i8>  undef to <vscale x 8 x i32>
+  %zext_nxv4_i16_to_i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+  %zext_nxv8_i8_to_i64  = zext <vscale x 8 x i8>  undef to <vscale x 8 x i64>
 
   %sext_nxv16_i8_to_i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
   %sext_nxv16_i8_to_i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
@@ -32,6 +44,10 @@ define void @sve_ext() {
   %sext_nxv8_i16_to_i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
   %sext_nxv8_i16_to_i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
   %sext_nxv4_i32_to_i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+  %sext_nxv4_i8_to_i64  = sext <vscale x 4 x i8>  undef to <vscale x 4 x i64>
+  %sext_nxv8_i8_to_i32  = sext <vscale x 8 x i8>  undef to <vscale x 8 x i32>
+  %sext_nxv4_i16_to_i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+  %sext_nxv8_i8_to_i64  = sext <vscale x 8 x i8>  undef to <vscale x 8 x i64>
 
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
index 6ee04fb8dac5e5..3996eb40a2b218 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
@@ -53,6 +53,14 @@ define void @scalable_ext_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv4i32to64 = zext <vscale x 4 x i32> %load.nxv4i32 to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv2i32to64 = zext <vscale x 2 x i32> %load.nxv2i32 to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv16i8to16 = sext <vscale x 16 x i8> %load2.nxv16i8 to <vscale x 16 x i16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
@@ -71,6 +79,14 @@ define void @scalable_ext_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv4i32to64 = sext <vscale x 4 x i32> %load2.nxv4i32 to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv2i32to64 = sext <vscale x 2 x i32> %load2.nxv2i32 to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
@@ -93,6 +109,14 @@ define void @scalable_ext_loads() {
   %zext.nxv4i32to64 = zext <vscale x 4 x i32> %load.nxv4i32 to <vscale x 4 x i64>
   %load.nxv2i32 = load <vscale x 2 x i32>, ptr undef
   %zext.nxv2i32to64 = zext <vscale x 2 x i32> %load.nxv2i32 to <vscale x 2 x i64>
+  %load.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef
+  %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64>
+  %load.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef
+  %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64>
+  %load.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef
+  %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32>
+  %load.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef
+  %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64>
 
   %load2.nxv16i8 = load <vscale x 16 x i8>, ptr undef
   %sext.nxv16i8to16 = sext <vscale x 16 x i8> %load2.nxv16i8 to <vscale x 16 x i16>
@@ -112,6 +136,14 @@ define void @scalable_ext_loads() {
   %sext.nxv4i32to64 = sext <vscale x 4 x i32> %load2.nxv4i32 to <vscale x 4 x i64>
   %load2.nxv2i32 = load <vscale x 2 x i32>, ptr undef
   %sext.nxv2i32to64 = sext <vscale x 2 x i32> %load2.nxv2i32 to <vscale x 2 x i64>
+  %load2.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef
+  %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64>
+  %load2.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef
+  %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64>
+  %load2.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef
+  %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32>
+  %load2.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef
+  %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64>
 
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll
index be518faf7e0516..4fadf34c1973f8 100644
--- a/llvm/test/Analysis/CostModel/RISCV/gep.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll
@@ -270,7 +270,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = getelementptr i8, ptr %base, i32 42
@@ -282,7 +282,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -340,7 +340,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %7 = getelementptr i8, ptr %base, i32 0
@@ -352,7 +352,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 93de623cf1c6da..85364c935267d2 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -206,10 +206,378 @@ define void @vp_fshl() {
   ret void
 }
 
+define void @add() {
+; CHECK-LABEL: 'add'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = add <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = add <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = add <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t14 = add <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t16 = add <16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = add <vscale x 2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t20 = add <vscale x 4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t22 = add <vscale x 8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t24 = add <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = add <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = add <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = add <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = add <vscale x 16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+  %t1 = add <2 x i8> undef, undef
+  %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+  %t3 = add <4 x i8> undef, undef
+  %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+  %t5 = add <8 x i8> undef, undef
+  %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+  %t7 = add <16 x i8> undef, undef
+  %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+  %t9 = add <2 x i64> undef, undef
+  %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+  %t12 = add <4 x i64> undef, undef
+  %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+  %t14 = add <8 x i64> undef, undef
+  %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+  %t16 = add <16 x i64> undef, undef
+  %t17 = call <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+  %t18 = add <vscale x 2 x i8> undef, undef
+  %t19 = call <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+  %t20 = add <vscale x 4 x i8> undef, undef
+  %t21 = call <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  %t22 = add <vscale x 8 x i8> undef, undef
+  %t23 = call <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+  %t24 = add <vscale x 16 x i8> undef, undef
+  %t25 = call <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+  %t26 = add <vscale x 2 x i64> undef, undef
+  %t27 = call <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+  %t28 = add <vscale x 4 x i64> undef, undef
+  %t29 = call <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+  %t30 = add <vscale x 8 x i64> undef, undef
+  %t31 = call <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+  %t32 = add <vscale x 16 x i64> undef, undef
+  ret void
+}
+
+define void @abs() {
+; CHECK-LABEL: 'abs'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %22 = call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %30 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef)
+  call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 0)
+  call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 0, <4 x i1> undef, i32 undef)
+  call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 0)
+  call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 0, <8 x i1> undef, i32 undef)
+  call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 0)
+  call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 0, <16 x i1> undef, i32 undef)
+  call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 0)
+  call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 0, <2 x i1> undef, i32 undef)
+  call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 0)
+  call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 0, <4 x i1> undef, i32 undef)
+  call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 0)
+  call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 0, <8 x i1> undef, i32 undef)
+  call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 0)
+  call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 0, <16 x i1> undef, i32 undef)
+  call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 0)
+  call <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+  call <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8> undef, i1 0)
+  call <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+  call <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8> undef, i1 0)
+  call <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+  call <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8> undef, i1 0)
+  call <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+  call <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8> undef, i1 0)
+  call <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64> undef, i1 0, <vscale x 2 x i1> undef, i32 undef)
+  call <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64> undef, i1 0)
+  call <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64> undef, i1 0, <vscale x 4 x i1> undef, i32 undef)
+  call <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64> undef, i1 0)
+  call <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64> undef, i1 0, <vscale x 8 x i1> undef, i32 undef)
+  call <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64> undef, i1 0)
+  call <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64> undef, i1 0, <vscale x 16 x i1> undef, i32 undef)
+  call <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64> undef, i1 0)
+  ret void
+}
+
+define void @load() {
+; CHECK-LABEL: 'load'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef)
+  %t1 = load <2 x i8>, ptr undef
+  %t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr undef, <4 x i1> undef, i32 undef)
+  %t3 = load <4 x i8>, ptr undef
+  %t4 = call <8 x i8> @llvm.vp.load.v8i8(ptr undef, <8 x i1> undef, i32 undef)
+  %t5 = load <8 x i8>, ptr undef
+  %t6 = call <16 x i8> @llvm.vp.load.v16i8(ptr undef, <16 x i1> undef, i32 undef)
+  %t7 = load <16 x i8>, ptr undef
+  %t8 = call <2 x i64> @llvm.vp.load.v2i64(ptr undef, <2 x i1> undef, i32 undef)
+  %t9 = load <2 x i64>, ptr undef
+  %t10 = call <4 x i64> @llvm.vp.load.v4i64(ptr undef, <4 x i1> undef, i32 undef)
+  %t12 = load <4 x i64>, ptr undef
+  %t13 = call <8 x i64> @llvm.vp.load.v8i64(ptr undef, <8 x i1> undef, i32 undef)
+  %t14 = load <8 x i64>, ptr undef
+  %t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef)
+  %t16 = load <16 x i64>, ptr undef
+  %t17 = call <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  %t18 = load <vscale x 2 x i8>, ptr undef
+  %t19 = call <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  %t20 = load <vscale x 4 x i8>, ptr undef
+  %t21 = call <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  %t22 = load <vscale x 8 x i8>, ptr undef
+  %t23 = call <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  %t24 = load <vscale x 16 x i8>, ptr undef
+  %t25 = call <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  %t26 = load <vscale x 2 x i64>, ptr undef
+  %t27 = call <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  %t28 = load <vscale x 4 x i64>, ptr undef
+  %t29 = call <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  %t30 = load <vscale x 8 x i64>, ptr undef
+  %t31 = call <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  %t32 = load <vscale x 16 x i64>, ptr undef
+  ret void
+}
+
+define void @store() {
+; CHECK-LABEL: 'store'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 2 x i64> undef, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 4 x i64> undef, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 8 x i64> undef, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.vp.store.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+  store <2 x i8> undef, ptr undef
+  call void @llvm.vp.store.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+  store <4 x i8> undef, ptr undef
+  call void @llvm.vp.store.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+  store <8 x i8> undef, ptr undef
+  call void @llvm.vp.store.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+  store <16 x i8> undef, ptr undef
+  call void @llvm.vp.store.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+  store <2 x i64> undef, ptr undef
+  call void @llvm.vp.store.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+  store <4 x i64> undef, ptr undef
+  call void @llvm.vp.store.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+  store <8 x i64> undef, ptr undef
+  call void @llvm.vp.store.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+  store <16 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  store <vscale x 2 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  store <vscale x 4 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  store <vscale x 8 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  store <vscale x 16 x i8> undef, ptr undef
+  call void @llvm.vp.store.nv2i64(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  store <vscale x 2 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv4i64(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  store <vscale x 4 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv8i64(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  store <vscale x 8 x i64> undef, ptr undef
+  call void @llvm.vp.store.nv16i64(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  store <vscale x 16 x i64> undef, ptr undef
+  ret void
+}
+
+declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.add.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.add.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.add.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.add.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.add.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+
+declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.abs.v4i8(<4 x i8>, i1, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.abs.v8i8(<8 x i8>, i1, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.abs.v16i8(<16 x i8>, i1, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.abs.v2i64(<2 x i64>, i1, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.abs.v4i64(<4 x i64>, i1, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.abs.v8i64(<8 x i64>, i1, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.abs.v16i64(<16 x i64>, i1, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
+
+declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
+declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
+declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
+declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
+declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
+declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
+declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
+declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
+declare <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8>, i1)
+declare <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8>, i1)
+declare <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8>, i1)
+declare <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8>, i1)
+declare <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64>, i1)
+declare <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64>, i1)
+declare <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64>, i1)
+declare <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64>, i1)
+
+declare <2 x i8> @llvm.vp.load.v2i8(ptr, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.load.v4i8(ptr, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.load.v8i8(ptr, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.load.v16i8(ptr, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.load.v2i64(ptr, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.load.v4i64(ptr, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.load.v8i64(ptr, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.load.v16i64(ptr, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr, <vscale x 16 x i1>, i32)
+
+declare void @llvm.vp.store.v2i8(<2 x i8>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i8(<4 x i8>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i8(<8 x i8>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i8(<16 x i8>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.v2i64(<2 x i64>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i64(<4 x i64>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i64(<8 x i64>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i64(<16 x i64>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.nv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+
 declare <vscale x 1 x i32> @llvm.fshr.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
 declare <vscale x 1 x i32> @llvm.fshl.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
 
-
 declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float>, i32)
 declare <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float>)
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-codesize.ll
index c2454855ae4435..a149ec45c863e3 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-codesize.ll
@@ -150,6 +150,96 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
   ret void
 }
 
+define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
+; SSE2-LABEL: 'test_vXf16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXf16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXf16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
+define void @test_vXbf16(<2 x bfloat> %src32, <4 x bfloat> %src64, <8 x bfloat> %src128, <16 x bfloat> %src256, <32 x bfloat> %src512) {
+; SSE-LABEL: 'test_vXbf16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXbf16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXbf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXbf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V32  = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
 define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-latency.ll
index 7a5e91bb80561a..119bce9410d5e7 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-latency.ll
@@ -150,6 +150,96 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
   ret void
 }
 
+define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
+; SSE2-LABEL: 'test_vXf16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXf16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXf16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
+define void @test_vXbf16(<2 x bfloat> %src32, <4 x bfloat> %src64, <8 x bfloat> %src128, <16 x bfloat> %src256, <32 x bfloat> %src512) {
+; SSE-LABEL: 'test_vXbf16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXbf16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXbf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXbf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V32  = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
 define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-sizelatency.ll
index 34754f8fba2f5e..b182738aa51869 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-sizelatency.ll
@@ -150,6 +150,96 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
   ret void
 }
 
+define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
+; SSE2-LABEL: 'test_vXf16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXf16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXf16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
+define void @test_vXbf16(<2 x bfloat> %src32, <4 x bfloat> %src64, <8 x bfloat> %src128, <16 x bfloat> %src256, <32 x bfloat> %src512) {
+; SSE-LABEL: 'test_vXbf16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXbf16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXbf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXbf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V32  = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
 define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
index 6de1c631ec643a..7614313bdb37ea 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
@@ -150,6 +150,96 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
   ret void
 }
 
+define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
+; SSE2-LABEL: 'test_vXf16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXf16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXf16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
+define void @test_vXbf16(<2 x bfloat> %src32, <4 x bfloat> %src64, <8 x bfloat> %src128, <16 x bfloat> %src256, <32 x bfloat> %src512) {
+; SSE-LABEL: 'test_vXbf16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXbf16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXbf16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXbf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V32  = shufflevector <2 x bfloat> %src32, <2 x bfloat> undef, <2 x i32> zeroinitializer
+  %V64  = shufflevector <4 x bfloat> %src64, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  %V128 = shufflevector <8 x bfloat> %src128, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  %V256 = shufflevector <16 x bfloat> %src256, <16 x bfloat> undef, <16 x i32> zeroinitializer
+  %V512 = shufflevector <32 x bfloat> %src512, <32 x bfloat> undef, <32 x i32> zeroinitializer
+  ret void
+}
+
 define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
diff --git a/llvm/test/Bitcode/Inputs/constexpr-to-instr-metadata-2.bc b/llvm/test/Bitcode/Inputs/constexpr-to-instr-metadata-2.bc
new file mode 100644
index 00000000000000..732e60110cfc4b
Binary files /dev/null and b/llvm/test/Bitcode/Inputs/constexpr-to-instr-metadata-2.bc differ
diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll
new file mode 100644
index 00000000000000..24938989e21b15
--- /dev/null
+++ b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata-2.bc | FileCheck %s
+
+; CHECK-LABEL: define void @_ZN4alsa3pcm3PCM17hw_params_current17hf1c237aece2f69c4E() {
+; CHECK: call void @llvm.dbg.value(metadata ptr undef, metadata !4, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg !14
+
+; CHECK-LABEL: define void @_ZN4alsa3pcm8HwParams3any17h02a64cfc85ce8a66E() {
+; CHECK: call void @llvm.dbg.value(metadata ptr undef, metadata !23, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg !28
diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll
index 78b5e30dd3154b..39a1b2687ae86b 100644
--- a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll
+++ b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll
@@ -1,12 +1,4 @@
 ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata.bc | FileCheck %s
 
 ; CHECK-LABEL: define void @test() {
-; CHECK: %constexpr = ptrtoint ptr @g to i32
-; CHECK: %constexpr1 = zext i32 %constexpr to i64
-; CHECK: %constexpr2 = ptrtoint ptr @g to i64
-; CHECK: %constexpr3 = lshr i64 %constexpr2, 32
-; CHECK: %constexpr4 = trunc i64 %constexpr3 to i32
-; CHECK: %constexpr5 = zext i32 %constexpr4 to i64
-; CHECK: %constexpr6 = shl i64 %constexpr5, 32
-; CHECK: %constexpr7 = or i64 %constexpr1, %constexpr6
-; CHECK: call void @llvm.dbg.value(metadata i64 %constexpr7, metadata !4, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg !13
+; CHECK: call void @llvm.dbg.value(metadata i64 undef, metadata !4, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg !13
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll
index 1986e79b3cec93..7badf4732fd0d4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll
@@ -39,8 +39,7 @@ entry:
 define <4 x half> @test_v4s16(<4 x half> %a) #0 {
 ; CHECK-LABEL: test_v4s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[0]
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    fmax v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
 entry:
@@ -52,8 +51,7 @@ entry:
 define <8 x half> @test_v8s16(<8 x half> %a) #0 {
 ; CHECK-LABEL: test_v8s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    dup v1.8h, v1.h[0]
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    fmax v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
 entry:
@@ -65,8 +63,7 @@ entry:
 define <2 x float> @test_v2s32(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_v2s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    dup v1.2s, v1.s[0]
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    fmax v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ret
 entry:
@@ -78,8 +75,7 @@ entry:
 define <4 x float> @test_v4s32(<4 x float> %a) #0 {
 ; CHECK-LABEL: test_v4s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[0]
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    fmax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    ret
 entry:
@@ -91,8 +87,7 @@ entry:
 define <2 x double> @test_v2s64(<2 x double> %a) #0 {
 ; CHECK-LABEL: test_v2s64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    dup v1.2d, v1.d[0]
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    fmax v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-saturating-arithmetic.ll b/llvm/test/CodeGen/AArch64/aarch64-saturating-arithmetic.ll
index 06cb13429b5732..db79b672d6f034 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-saturating-arithmetic.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-saturating-arithmetic.ll
@@ -61,10 +61,7 @@ define i64 @test_ssub_nonneg_lhs(i64 %x) {
 define i64 @test_ssub_neg_lhs(i64 %x) {
 ; CHECK-LABEL: test_ssub_neg_lhs:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov x9, #-9223372036854775808 // =0x8000000000000000
-; CHECK-NEXT:    subs x8, x8, x0
-; CHECK-NEXT:    csel x0, x9, x8, vs
+; CHECK-NEXT:    mvn x0, x0
 ; CHECK-NEXT:    ret
   %sat = call i64 @llvm.ssub.sat.i64(i64 -1, i64 %x)
   ret i64 %sat
diff --git a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
index 74fc7b317708e6..1a9c5974a547a8 100644
--- a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
@@ -1,10 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -codegenprepare -mtriple=arm64_32-apple-ios %s -S -o - | FileCheck %s
 
 define void @test_simple_sink(ptr %base, i64 %offset) {
-; CHECK-LABEL: @test_simple_sink
-; CHECK: next:
-; CHECK:   [[ADDR8:%.*]] = getelementptr i8, ptr %base, i64 %offset
-; CHECK:   load volatile i1, ptr [[ADDR8]]
+; CHECK-LABEL: define void @test_simple_sink(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i1, ptr [[BASE]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[TST:%.*]] = load i1, ptr [[ADDR]], align 1
+; CHECK-NEXT:    br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1
+; CHECK-NEXT:    ret void
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
   %addr = getelementptr i1, ptr %base, i64 %offset
   %tst = load i1, ptr %addr
   br i1 %tst, label %next, label %end
@@ -18,10 +27,18 @@ end:
 }
 
 define void @test_inbounds_sink(ptr %base, i64 %offset) {
-; CHECK-LABEL: @test_inbounds_sink
-; CHECK: next:
-; CHECK:   [[ADDR8:%.*]] = getelementptr inbounds i8, ptr %base, i64 %offset
-; CHECK:   load volatile i1, ptr [[ADDR8]]
+; CHECK-LABEL: define void @test_inbounds_sink(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i1, ptr [[BASE]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[TST:%.*]] = load i1, ptr [[ADDR]], align 1
+; CHECK-NEXT:    br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1
+; CHECK-NEXT:    ret void
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
   %addr = getelementptr inbounds i1, ptr %base, i64 %offset
   %tst = load i1, ptr %addr
   br i1 %tst, label %next, label %end
@@ -36,10 +53,20 @@ end:
 
 ; No address derived via an add can be guaranteed inbounds
 define void @test_add_sink(ptr %base, i64 %offset) {
-; CHECK-LABEL: @test_add_sink
-; CHECK: next:
-; CHECK:   [[ADDR8:%.*]] = getelementptr i8, ptr %base, i64 %offset
-; CHECK:   load volatile i1, ptr [[ADDR8]]
+; CHECK-LABEL: define void @test_add_sink(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT:    [[BASE64:%.*]] = ptrtoint ptr [[BASE]] to i64
+; CHECK-NEXT:    [[ADDR64:%.*]] = add nuw nsw i64 [[BASE64]], [[OFFSET]]
+; CHECK-NEXT:    [[ADDR:%.*]] = inttoptr i64 [[ADDR64]] to ptr
+; CHECK-NEXT:    [[TST:%.*]] = load i1, ptr [[ADDR]], align 1
+; CHECK-NEXT:    br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1
+; CHECK-NEXT:    ret void
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
   %base64 = ptrtoint ptr %base to i64
   %addr64 = add nsw nuw i64 %base64, %offset
   %addr = inttoptr i64 %addr64 to ptr
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign-noneon.ll
similarity index 100%
rename from llvm/test/CodeGen/AArch64/fcopysign.ll
rename to llvm/test/CodeGen/AArch64/fcopysign-noneon.ll
diff --git a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
new file mode 100644
index 00000000000000..73b180dc4ae765
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+
+; Check there's no assert in spilling from implicit-def operands on an
+; IMPLICIT_DEF.
+
+define void @widget(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) {
+; CHECK-LABEL: widget:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    sub sp, sp, #144
+; CHECK-NEXT:    stp x28, x27, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #64] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #80] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #96] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #112] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #128] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 144
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
+; CHECK-NEXT:    .cfi_offset w21, -40
+; CHECK-NEXT:    .cfi_offset w22, -48
+; CHECK-NEXT:    .cfi_offset w23, -56
+; CHECK-NEXT:    .cfi_offset w24, -64
+; CHECK-NEXT:    .cfi_offset w25, -72
+; CHECK-NEXT:    .cfi_offset w26, -80
+; CHECK-NEXT:    .cfi_offset w27, -88
+; CHECK-NEXT:    .cfi_offset w28, -96
+; CHECK-NEXT:    mov w19, w6
+; CHECK-NEXT:    mov w20, w5
+; CHECK-NEXT:    mov x21, x4
+; CHECK-NEXT:    mov x22, x3
+; CHECK-NEXT:    mov x23, x2
+; CHECK-NEXT:    mov w24, w1
+; CHECK-NEXT:    mov w25, w0
+; CHECK-NEXT:    mov w26, w1
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x27, LJTI0_0@PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    add x27, x27, LJTI0_0@PAGEOFF
+; CHECK-NEXT:    mov w28, #1 ; =0x1
+; CHECK-NEXT:    ; implicit-def: $w8
+; CHECK-NEXT:    str x8, [sp, #40] ; 8-byte Folded Spill
+; CHECK-NEXT:    b LBB0_2
+; CHECK-NEXT:  LBB0_1: ; %bb10
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mov w0, w20
+; CHECK-NEXT:    mov x1, x22
+; CHECK-NEXT:    str wzr, [x21]
+; CHECK-NEXT:    bl _putc
+; CHECK-NEXT:    mov w0, w25
+; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    bl _putc
+; CHECK-NEXT:  LBB0_2: ; %bb8
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmp w24, #39
+; CHECK-NEXT:    b.hi LBB0_5
+; CHECK-NEXT:  ; %bb.3: ; %bb8
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    adr x8, LBB0_1
+; CHECK-NEXT:    ldrb w9, [x27, x26]
+; CHECK-NEXT:    add x8, x8, x9, lsl #2
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  LBB0_4: ; %bb9
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    str w28, [x23]
+; CHECK-NEXT:    b LBB0_2
+; CHECK-NEXT:  LBB0_5: ; %bb8
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    cmp w24, #64
+; CHECK-NEXT:    b.ne LBB0_2
+; CHECK-NEXT:    b LBB0_9
+; CHECK-NEXT:  LBB0_6: ; %bb13
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mov w8, #1 ; =0x1
+; CHECK-NEXT:    str x8, [sp, #40] ; 8-byte Folded Spill
+; CHECK-NEXT:    tbz w19, #0, LBB0_2
+; CHECK-NEXT:  ; %bb.7: ; %bb14
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    mov w8, #1 ; =0x1
+; CHECK-NEXT:    stp xzr, xzr, [sp]
+; CHECK-NEXT:    stp x8, xzr, [sp, #16]
+; CHECK-NEXT:    bl _fprintf
+; CHECK-NEXT:    b LBB0_2
+; CHECK-NEXT:  LBB0_8: ; %bb12
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    b LBB0_8
+; CHECK-NEXT:  LBB0_9: ; %bb16
+; CHECK-NEXT:    ldr x8, [sp, #40] ; 8-byte Folded Reload
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    ; kill: def $w8 killed $w8 killed $x8 def $x8
+; CHECK-NEXT:    str x8, [sp]
+; CHECK-NEXT:    bl _fprintf
+; CHECK-NEXT:    brk #0x1
+; CHECK-NEXT:    .loh AdrpAdd Lloh0, Lloh1
+; CHECK-NEXT:    .cfi_endproc
+; CHECK-NEXT:    .section __TEXT,__const
+; CHECK-NEXT:  LJTI0_0:
+; CHECK-NEXT:    .byte (LBB0_6-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_8-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_4-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_1-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_2-LBB0_1)>>2
+; CHECK-NEXT:    .byte (LBB0_1-LBB0_1)>>2
+bb:
+  br label %bb7
+
+bb7:                                              ; preds = %bb14, %bb13, %bb
+  %phi = phi i32 [ poison, %bb ], [ %mul, %bb14 ], [ %mul, %bb13 ]
+  br label %bb8
+
+bb8:                                              ; preds = %bb10, %bb9, %bb8, %bb7
+  switch i32 %arg1, label %bb8 [
+    i32 10, label %bb9
+    i32 64, label %bb16
+    i32 0, label %bb13
+    i32 39, label %bb10
+    i32 34, label %bb10
+    i32 1, label %bb12
+  ]
+
+bb9:                                              ; preds = %bb8
+  store i32 1, ptr %arg2, align 4
+  br label %bb8
+
+bb10:                                             ; preds = %bb8, %bb8
+  store i32 0, ptr %arg4, align 4
+  %call = tail call i32 @putc(i32 %arg5, ptr %arg3)
+  %call11 = tail call i32 @putc(i32 %arg, ptr null)
+  br label %bb8
+
+bb12:                                             ; preds = %bb12, %bb8
+  br label %bb12
+
+bb13:                                             ; preds = %bb8
+  %mul = mul i32 1, 1
+  br i1 %arg6, label %bb14, label %bb7
+
+bb14:                                             ; preds = %bb13
+  %call15 = tail call i32 (ptr, ptr, ...) @fprintf(ptr null, ptr null, ptr null, i32 0, i32 %mul, ptr null)
+  br label %bb7
+
+bb16:                                             ; preds = %bb8
+  %call17 = tail call i32 (ptr, ptr, ...) @fprintf(ptr null, ptr null, i32 %phi)
+  unreachable
+}
+
+declare i32 @fprintf(ptr, ptr, ...)
+declare i32 @putc(i32, ptr)
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
index 9c115a1de0db16..7d5a7e247087fc 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
@@ -67,7 +67,7 @@ tracksRegLiveness: true
 body:             |
   bb.0 (%ir-block.0):
   liveins: $lr
-    frame-setup PAUTH_PROLOGUE
+    frame-setup PAUTH_PROLOGUE implicit-def $lr, implicit $lr, implicit $sp
     $sp = frame-setup SUBXri $sp, 16, 0
     renamable $x8 = ADRP target-flags(aarch64-page) @v
     $x9 = ADDXri $sp, 12, 0
@@ -79,7 +79,7 @@ body:             |
     STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     $sp = frame-destroy ADDXri $sp, 16, 0
-    frame-destroy PAUTH_EPILOGUE
+    frame-destroy PAUTH_EPILOGUE implicit-def $lr, implicit $lr, implicit $sp
     RET undef $lr
 
 # CHECK-LABEL:    name:            legal0
@@ -100,7 +100,7 @@ tracksRegLiveness: true
 body:             |
   bb.0 (%ir-block.0):
   liveins: $lr
-    frame-setup PAUTH_PROLOGUE
+    frame-setup PAUTH_PROLOGUE implicit-def $lr, implicit $lr, implicit $sp
     $sp = frame-setup SUBXri $sp, 16, 0
     renamable $x8 = ADRP target-flags(aarch64-page) @v
     $x9 = ADDXri $sp, 12, 0
@@ -112,7 +112,7 @@ body:             |
     STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     $sp = frame-destroy ADDXri $sp, 16, 0
-    frame-destroy PAUTH_EPILOGUE
+    frame-destroy PAUTH_EPILOGUE implicit-def $lr, implicit $lr, implicit $sp
     RET undef $lr
 
 # CHECK-LABEL:    name:            legal1
@@ -133,7 +133,7 @@ tracksRegLiveness: true
 body:             |
   bb.0 (%ir-block.0):
   liveins: $lr
-    frame-setup PAUTH_PROLOGUE
+    frame-setup PAUTH_PROLOGUE implicit-def $lr, implicit $lr, implicit $sp
     $sp = frame-setup SUBXri $sp, 16, 0
     renamable $x8 = ADRP target-flags(aarch64-page) @v
     $x9 = ADDXri $sp, 12, 0
@@ -145,7 +145,7 @@ body:             |
     STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     $sp = frame-destroy ADDXri $sp, 12, 0
-    frame-destroy PAUTH_EPILOGUE
+    frame-destroy PAUTH_EPILOGUE implicit-def $lr, implicit $lr, implicit $sp
     RET undef $lr
 
 ...
@@ -155,7 +155,7 @@ tracksRegLiveness: true
 body:             |
   bb.0 (%ir-block.0):
   liveins: $lr
-    frame-setup PAUTH_PROLOGUE
+    frame-setup PAUTH_PROLOGUE implicit-def $lr, implicit $lr, implicit $sp
     $sp = frame-setup SUBXri $sp, 16, 0
     renamable $x8 = ADRP target-flags(aarch64-page) @v
     $x9 = ADDXri $sp, 12, 0
@@ -167,7 +167,7 @@ body:             |
     STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store (s64) into @v)
     $sp = frame-destroy ADDXri $sp, 12, 0
-    frame-destroy PAUTH_EPILOGUE
+    frame-destroy PAUTH_EPILOGUE implicit-def $lr, implicit $lr, implicit $sp
     RET undef $lr
 
 # CHECK-LABEL:    name:            illegal0
diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
index 48242d99a6002a..6ad98339e14773 100644
--- a/llvm/test/CodeGen/AArch64/neon-mov.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -305,44 +305,26 @@ define <1 x i64> @movid() {
 }
 
 define <2 x float> @fmov2s() {
-; CHECK-SD-LABEL: fmov2s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov v0.2s, #-12.00000000
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fmov2s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, #-12.00000000
-; CHECK-GI-NEXT:    dup v0.2s, v0.s[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fmov2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.2s, #-12.00000000
+; CHECK-NEXT:    ret
 	ret <2 x float> < float -1.2e1, float -1.2e1>
 }
 
 define <4 x float> @fmov4s() {
-; CHECK-SD-LABEL: fmov4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov v0.4s, #-12.00000000
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fmov4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, #-12.00000000
-; CHECK-GI-NEXT:    dup v0.4s, v0.s[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fmov4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.4s, #-12.00000000
+; CHECK-NEXT:    ret
 	ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1>
 }
 
 define <2 x double> @fmov2d() {
-; CHECK-SD-LABEL: fmov2d:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov v0.2d, #-12.00000000
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fmov2d:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov d0, #-12.00000000
-; CHECK-GI-NEXT:    dup v0.2d, v0.d[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fmov2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.2d, #-12.00000000
+; CHECK-NEXT:    ret
 	ret <2 x double> < double -1.2e1, double -1.2e1>
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sched-loop-align.ll b/llvm/test/CodeGen/AArch64/sched-loop-align.ll
new file mode 100644
index 00000000000000..5b8e42c2790a43
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sched-loop-align.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=aarch64-windows | FileCheck %s --check-prefix=WINDOWS
+; RUN: llc < %s -mtriple=aarch64-linux | FileCheck %s --check-prefix=LINUX
+
+define dso_local void @b() #0 {
+entry:
+  br label %for.cond
+
+for.cond:
+  tail call void @a()
+  br label %for.cond
+}
+
+declare dso_local void @a(...)
+
+attributes #0 = { noreturn nounwind uwtable "tune-cpu"="cortex-a53" }
+
+; LINUX-LABEL: b:
+; LINUX: .p2align 4
+
+; WINDOWS-LABEL: b:
+; WINDOWS-NOT: .p2align
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold-dbg-value-crash.mir b/llvm/test/CodeGen/AArch64/sink-and-fold-dbg-value-crash.mir
index 0fc645022780d6..82dae8a86ea2b7 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold-dbg-value-crash.mir
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold-dbg-value-crash.mir
@@ -26,7 +26,16 @@
     ret void
   }
 
-  declare ptr @g(ptr)
+  define ptr @g(ptr) {
+  entry:
+    br label %if.then
+  if.then:
+    br label %if.end
+  if.end:
+    br label %exit
+  exit:
+    ret ptr null
+  }
 
   declare void @llvm.dbg.value(metadata, metadata, metadata) #0
 
@@ -170,3 +179,126 @@ body:             |
     RET_ReallyLR
 
 ...
+---
+name:            g
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: gpr64all, preferred-register: '' }
+  - { id: 1, class: gpr64common, preferred-register: '' }
+  - { id: 2, class: gpr32, preferred-register: '' }
+  - { id: 3, class: gpr32, preferred-register: '' }
+  - { id: 4, class: gpr32, preferred-register: '' }
+  - { id: 5, class: gpr64sp, preferred-register: '' }
+  - { id: 6, class: gpr64all, preferred-register: '' }
+liveins:
+  - { reg: '$x0', virtual-reg: '%1' }
+  - { reg: '$w1', virtual-reg: '%2' }
+  - { reg: '$w2', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: g
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $x0, $w1, $w2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32 = COPY $w2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr64common = COPY $x0
+  ; CHECK-NEXT:   DBG_VALUE_LIST !4, !DIExpression(), [[COPY]], $noreg, debug-location !10
+  ; CHECK-NEXT:   TBZW [[COPY1]], 0, %bb.2
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.then:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY]]
+  ; CHECK-NEXT:   DBG_VALUE_LIST !4, !DIExpression(), [[COPY3]], $noreg, debug-location !10
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   $x0 = ADDXri [[COPY2]], 8, 0
+  ; CHECK-NEXT:   DBG_VALUE_LIST !4, !DIExpression(), [[COPY3]], $x0, debug-location !10
+  ; CHECK-NEXT:   BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   TBNZW [[COPY3]], 0, %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.end:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.exit:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $x0, $w1, $w2
+
+    %3:gpr32 = COPY $w2
+    %2:gpr32 = COPY $w1
+    %1:gpr64common = COPY $x0
+    %4:gpr32 = COPY %3
+    %5:gpr64sp = ADDXri %1, 8, 0
+    DBG_VALUE_LIST !4, !DIExpression(), %4:gpr32, %5:gpr64sp, debug-location !10
+    %0:gpr64all = COPY %5
+    TBZW %2, 0, %bb.2
+    B %bb.1
+
+  bb.1.if.then:
+    successors: %bb.3(0x40000000), %bb.2(0x40000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = COPY %0
+    BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    TBNZW %4, 0, %bb.3
+    B %bb.2
+
+  bb.2.if.end:
+    successors: %bb.3(0x80000000)
+
+
+  bb.3.exit:
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 632fdb39105312..52007221e12a7b 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-enable-sink-fold=true < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
 target triple = "aarch64-linux"
 
 declare i32 @use(...)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
index 24ece2873adb85..6871fd53fa6ad3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -194,28 +194,13 @@ define void @test_revhv32i16(ptr %a) #0 {
 define void @test_rev_elts_fail(ptr %a) #1 {
 ; CHECK-LABEL: test_rev_elts_fail:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    adrp x8, .LCPI11_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI11_0
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d[2]
-; CHECK-NEXT:    mov z2.d, z0.d[3]
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    fmov x10, d2
-; CHECK-NEXT:    stp x10, x8, [sp, #16]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    stp x9, x8, [sp]
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -260,39 +245,26 @@ define void @test_revdv4f64_sve2p1(ptr %a) #2 {
 
 ; sve-vector-bits-min=256, sve-vector-bits-max is not set, REV inst can't be generated.
 define void @test_revv8i32(ptr %a) #0 {
-; CHECK-LABEL: test_revv8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w10, v0.s[3]
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    mov z1.s, z0.s[4]
-; CHECK-NEXT:    mov z2.s, z0.s[5]
-; CHECK-NEXT:    mov z3.s, z0.s[6]
-; CHECK-NEXT:    mov z0.s, z0.s[7]
-; CHECK-NEXT:    stp w8, w11, [sp, #24]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    stp w10, w9, [sp, #16]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    stp w9, w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    stp w9, w8, [sp]
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; VBITS_GE_256-LABEL: test_revv8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    index z0.s, #7, #-1
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    tbl z0.s, { z1.s }, z0.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: test_revv8i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    adrp x8, .LCPI14_0
+; VBITS_GE_512-NEXT:    add x8, x8, :lo12:.LCPI14_0
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p1/z, [x8]
+; VBITS_GE_512-NEXT:    tbl z0.s, { z0.s }, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <8 x i32> %tmp2, ptr %a
@@ -379,60 +351,13 @@ define void @test_revv8i32v8i32(ptr %a, ptr %b) #1 {
 define void @test_rev_fail(ptr %a) #1 {
 ; CHECK-LABEL: test_rev_fail:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    sub x9, sp, #48
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    adrp x8, .LCPI20_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI20_0
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[8]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z2.h, z0.h[9]
-; CHECK-NEXT:    mov z3.h, z0.h[10]
-; CHECK-NEXT:    mov z4.h, z0.h[11]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.h, z0.h[12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z0.h[13]
-; CHECK-NEXT:    strh w8, [sp, #30]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z3.h, z0.h[14]
-; CHECK-NEXT:    strh w9, [sp, #28]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    mov z4.h, z0.h[15]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    strh w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w11, s3
-; CHECK-NEXT:    strh w9, [sp, #24]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    fmov w12, s4
-; CHECK-NEXT:    strh w10, [sp, #20]
-; CHECK-NEXT:    umov w10, v0.h[3]
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    strh w11, [sp, #18]
-; CHECK-NEXT:    umov w11, v0.h[4]
-; CHECK-NEXT:    strh w12, [sp, #16]
-; CHECK-NEXT:    umov w12, v0.h[5]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    umov w9, v0.h[6]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    umov w8, v0.h[7]
-; CHECK-NEXT:    strh w10, [sp, #8]
-; CHECK-NEXT:    strh w11, [sp, #6]
-; CHECK-NEXT:    strh w12, [sp, #4]
-; CHECK-NEXT:    strh w9, [sp, #2]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
new file mode 100644
index 00000000000000..f646319ba5fccb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128  < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128
+; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SVE2_128: .LCPI0_0:
+; SVE2_128-NEXT:        .byte   0                               // 0x0
+; SVE2_128-NEXT:        .byte   7                               // 0x7
+; SVE2_128-NEXT:        .byte   2                               // 0x2
+; SVE2_128-NEXT:        .byte   3                               // 0x3
+; SVE2_128-NEXT:        .byte   4                               // 0x4
+; SVE2_128-NEXT:        .byte   5                               // 0x5
+; SVE2_128-NEXT:        .byte   6                               // 0x6
+; SVE2_128-NEXT:        .byte   7                               // 0x7
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+define <8 x i8> @shuffle_index_indices_from_op1(ptr %a, ptr %b) {
+; CHECK-LABEL: shuffle_index_indices_from_op1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 0, i32 7, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %1
+}
+
+; SVE2_128: .LCPI1_0:
+; SVE2_128-NEXT:        .byte   0                               // 0x0
+; SVE2_128-NEXT:        .byte   1                               // 0x1
+; SVE2_128-NEXT:        .byte   1                               // 0x1
+; SVE2_128-NEXT:        .byte   3                               // 0x3
+; SVE2_128-NEXT:        .byte   4                               // 0x4
+; SVE2_128-NEXT:        .byte   7                               // 0x7
+; SVE2_128-NEXT:        .byte   6                               // 0x6
+; SVE2_128-NEXT:        .byte   7                               // 0x7
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+define <8 x i8> @shuffle_index_indices_from_op2(ptr %a, ptr %b) {
+; CHECK-LABEL: shuffle_index_indices_from_op2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 8, i32 9, i32 9, i32 11, i32 12, i32 15, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; SVE2_128: .LCPI2_0:
+; SVE2_128-NEXT:        .byte   1                               // 0x1
+; SVE2_128-NEXT:        .byte   17                              // 0x11
+; SVE2_128-NEXT:        .byte   18                              // 0x12
+; SVE2_128-NEXT:        .byte   19                              // 0x13
+; SVE2_128-NEXT:        .byte   20                              // 0x14
+; SVE2_128-NEXT:        .byte   20                              // 0x14
+; SVE2_128-NEXT:        .byte   22                              // 0x16
+; SVE2_128-NEXT:        .byte   23                              // 0x17
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
+; SVE2_128-LABEL: shuffle_index_indices_from_both_ops:
+; SVE2_128:       // %bb.0:
+; SVE2_128-NEXT:    adrp x8, .LCPI2_0
+; SVE2_128-NEXT:    ldr d0, [x0]
+; SVE2_128-NEXT:    ldr d1, [x1]
+; SVE2_128-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
+; SVE2_128-NEXT:    tbl z0.b, { z0.b, z1.b }, z2.b
+; SVE2_128-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2_128-NEXT:    ret
+;
+; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
+; SVE2_128_NOMAX:       // %bb.0:
+; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
+; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s1
+; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT:    fmov w9, s2
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #15]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s3
+; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #14]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    fmov w9, s2
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #13]
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #12]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s3
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #11]
+; SVE2_128_NOMAX-NEXT:    fmov w9, s0
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #10]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s1
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #9]
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #8]
+; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
+; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; SVE2_128: .LCPI3_0:
+; SVE2_128-NEXT:        .byte   1                               // 0x1
+; SVE2_128-NEXT:        .byte   17                              // 0x11
+; SVE2_128-NEXT:        .byte   18                              // 0x12
+; SVE2_128-NEXT:        .byte   19                              // 0x13
+; SVE2_128-NEXT:        .byte   20                              // 0x14
+; SVE2_128-NEXT:        .byte   20                              // 0x14
+; SVE2_128-NEXT:        .byte   22                              // 0x16
+; SVE2_128-NEXT:        .byte   0                               // 0x0
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+; SVE2_128-NEXT:        .byte   255                             // 0xff
+define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
+; SVE2_128-LABEL: shuffle_index_poison_value:
+; SVE2_128:       // %bb.0:
+; SVE2_128-NEXT:    adrp x8, .LCPI3_0
+; SVE2_128-NEXT:    ldr d0, [x0]
+; SVE2_128-NEXT:    ldr d1, [x1]
+; SVE2_128-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; SVE2_128-NEXT:    tbl z0.b, { z0.b, z1.b }, z2.b
+; SVE2_128-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2_128-NEXT:    ret
+;
+; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
+; SVE2_128_NOMAX:       // %bb.0:
+; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
+; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
+; SVE2_128_NOMAX-NEXT:    ldr d3, [x0]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s1
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT:    fmov w9, s2
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #14]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s1
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z3.b[1]
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #13]
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #12]
+; SVE2_128_NOMAX-NEXT:    fmov w9, s2
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #11]
+; SVE2_128_NOMAX-NEXT:    fmov w8, s0
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #10]
+; SVE2_128_NOMAX-NEXT:    fmov w9, s1
+; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #9]
+; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #8]
+; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
+; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 poison>
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
+; CHECK-LABEL: shuffle_op1_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> poison, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; In this function, we could not represent indexes for the second operand
+; because for i8 type, the maximum constant in the mask is 256.
+define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
+; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    mov z1.b, z0.b[7]
+; CHECK-NEXT:    mov z2.b, z0.b[6]
+; CHECK-NEXT:    mov z3.b, z0.b[4]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    mov z2.b, z0.b[3]
+; CHECK-NEXT:    mov z1.b, z1.b[1]
+; CHECK-NEXT:    strb w8, [sp, #15]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[2]
+; CHECK-NEXT:    strb w9, [sp, #14]
+; CHECK-NEXT:    mov z0.b, z0.b[1]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w9, [sp, #11]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    strb w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strb w9, [sp, #9]
+; CHECK-NEXT:    strb w8, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; CHECK: .LCPI6_0:
+; CHECK-NEXT:        .byte   0                               // 0x0
+; CHECK-NEXT:        .byte   7                               // 0x7
+; CHECK-NEXT:        .byte   2                               // 0x2
+; CHECK-NEXT:        .byte   3                               // 0x3
+; CHECK-NEXT:        .byte   4                               // 0x4
+; CHECK-NEXT:        .byte   5                               // 0x5
+; CHECK-NEXT:        .byte   6                               // 0x6
+; CHECK-NEXT:        .byte   7                               // 0x7
+; CHECK-NEXT:        .byte   255                             // 0xff
+; CHECK-NEXT:        .byte   255                             // 0xff
+define <8 x i8> @shuffle_index_size_op1_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
+; CHECK-LABEL: shuffle_index_size_op1_maxhw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    adrp x8, .LCPI6_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI6_0
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 0, i32 7, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %1
+}
diff --git a/llvm/test/CodeGen/AArch64/ushl_sat.ll b/llvm/test/CodeGen/AArch64/ushl_sat.ll
index a86803e5026112..870f80545f9993 100644
--- a/llvm/test/CodeGen/AArch64/ushl_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ushl_sat.ll
@@ -128,9 +128,9 @@ define i16 @combine_shlsat_to_shl(i16 %x) nounwind {
 define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
 ; CHECK-LABEL: combine_shlsat_to_shl_no_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xfffc
-; CHECK-NEXT:    lsl w9, w8, #17
-; CHECK-NEXT:    lsl w8, w8, #14
+; CHECK-NEXT:    lsl w8, w0, #14
+; CHECK-NEXT:    and w8, w8, #0x3fff0000
+; CHECK-NEXT:    lsl w9, w8, #3
 ; CHECK-NEXT:    cmp w8, w9, lsr #3
 ; CHECK-NEXT:    csinv w8, w9, wzr, eq
 ; CHECK-NEXT:    lsr w0, w8, #16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 2263672adb9458..7028d1157787fe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -77,7 +77,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
@@ -150,7 +153,10 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
@@ -360,7 +366,10 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -402,7 +411,10 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -447,7 +459,10 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -492,7 +507,10 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 5b3cba2000de6f..0417b97a53c0ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -318,6 +318,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -334,9 +336,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
 ; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_f64:
@@ -381,6 +381,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[0:1]
 ; CI-NEXT:    v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -394,9 +396,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fast_frem_f64:
@@ -438,6 +438,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[0:1]
 ; CI-NEXT:    v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -451,9 +453,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: unsafe_frem_f64:
@@ -532,15 +532,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
 ; CI-NEXT:    v_trunc_f32_e32 v3, v3
 ; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT:    v_or_b32_e32 v2, v0, v1
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f16:
@@ -669,15 +669,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
 ; CI-NEXT:    v_trunc_f32_e32 v5, v5
 ; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v1, v2, v1
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f16:
@@ -1017,6 +1017,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_mov_b32_e32 v1, s9
 ; CI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
 ; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -1043,9 +1045,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
 ; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; CI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
-; CI-NEXT:    v_mov_b32_e32 v4, s4
-; CI-NEXT:    v_mov_b32_e32 v5, s5
-; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index d2b2aef8077f94..d6957be8ab8ffb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -50,11 +50,12 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_or_b32 s2, s0, s2
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_s_s:
@@ -135,19 +136,21 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_s_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s0, s3, 1
 ; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, s1, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, s1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_s_s:
@@ -228,14 +231,15 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    s_and_b32 s1, s4, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_v_s:
@@ -318,15 +322,16 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX7-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_not_b32_e32 v0, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_s_v:
@@ -410,15 +415,16 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v1
-; GFX7-NEXT:    v_not_b32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_v_v:
@@ -499,19 +505,21 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_s_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_s_v:
@@ -590,19 +598,21 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_v_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s0, s2, 1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_v_s:
@@ -681,19 +691,21 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_v_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_v_v:
@@ -842,7 +854,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_s_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s1, s3, 1
 ; GFX7-NEXT:    s_lshr_b32 s0, s3, 1
 ; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -851,16 +866,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v4, s1, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, s2, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_s_s:
@@ -980,15 +994,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s4
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
-; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v4i16_v_s:
@@ -1119,15 +1134,16 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_not_b32_e32 v0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v4i16_s_v:
@@ -1258,15 +1274,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v4i16_v_v:
@@ -1376,25 +1393,27 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_s_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v6, s0, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v4, s0, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_not_b32_e32 v2, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_s_v:
@@ -1498,7 +1517,10 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_v_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1507,16 +1529,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v5, s1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_v_s:
@@ -1619,25 +1640,27 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_v_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_not_b32_e32 v3, v3
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_v_v:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
index ea5a0c5aceb5d0..f26e23293dae79 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -31,6 +31,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -42,30 +43,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_4
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -100,6 +106,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_2
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -111,30 +118,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_2
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_2
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_2
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_2
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_2
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -169,6 +181,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -180,30 +193,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -238,6 +256,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -249,30 +268,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_v2s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -307,6 +331,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -318,30 +343,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v4s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_global_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_global_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_global_v4s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -368,39 +398,55 @@ body: |
     ; GFX6-LABEL: name: load_global_s64
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_s64
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -427,39 +473,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v2s64
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>), align 4, addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_global_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_global_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_global_v2s64
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -489,36 +551,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX7-LABEL: name: load_global_v2p1
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX8-LABEL: name: load_global_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX9-LABEL: name: load_global_v2p1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX10-LABEL: name: load_global_v2p1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX11-LABEL: name: load_global_v2p1
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -548,36 +616,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX7-LABEL: name: load_global_s128
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s128
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX8-LABEL: name: load_global_s128
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX9-LABEL: name: load_global_s128
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX10-LABEL: name: load_global_s128
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX11-LABEL: name: load_global_s128
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -604,39 +678,55 @@ body: |
     ; GFX6-LABEL: name: load_global_p3_from_4
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load (p3), addrspace 1)
-    ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_p3_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1)
-    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_p3_from_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_global_p3_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_global_p3_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_global_p3_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_global_p3_from_4
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -663,39 +753,55 @@ body: |
     ; GFX6-LABEL: name: load_global_p1_from_8
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load (p1), addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_p1_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_p1_from_8
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_p1_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_p1_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_p1_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_p1_from_8
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -725,36 +831,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX7-LABEL: name: load_global_p999_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_p999_from_8
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX8-LABEL: name: load_global_p999_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX9-LABEL: name: load_global_p999_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX10-LABEL: name: load_global_p999_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX11-LABEL: name: load_global_p999_from_8
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -784,36 +896,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX7-LABEL: name: load_global_v2p3
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX8-LABEL: name: load_global_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX9-LABEL: name: load_global_v2p3
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX10-LABEL: name: load_global_v2p3
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX11-LABEL: name: load_global_v2p3
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -840,39 +958,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v2s16
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load (<2 x s16>), addrspace 1)
-    ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1)
-    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_global_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_global_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_global_v2s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -899,39 +1033,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v4s16
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load (<4 x s16>), addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v4s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_v4s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -958,39 +1108,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v8s16
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v8s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v8s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_global_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_global_v8s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_global_v8s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_global_v8s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1029,6 +1195,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1040,6 +1207,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1056,6 +1224,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1072,18 +1241,21 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1120,6 +1292,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1131,6 +1304,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1147,6 +1321,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1163,12 +1338,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1185,6 +1362,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1231,6 +1409,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1252,6 +1431,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1268,6 +1448,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1284,18 +1465,21 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1342,6 +1526,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1363,6 +1548,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1379,6 +1565,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1395,18 +1582,21 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1443,6 +1633,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1454,6 +1645,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1470,6 +1662,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1486,12 +1679,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1508,6 +1703,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1545,6 +1741,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1557,6 +1754,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1573,6 +1771,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1589,6 +1788,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -1605,6 +1805,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1621,6 +1822,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1677,6 +1879,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1698,6 +1901,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1714,6 +1918,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1730,12 +1935,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1752,6 +1959,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1798,6 +2006,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1819,6 +2028,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1835,6 +2045,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1851,12 +2062,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1873,6 +2086,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1910,6 +2124,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1922,6 +2137,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1938,6 +2154,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1954,6 +2171,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -1970,6 +2188,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1986,6 +2205,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -2033,6 +2253,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -2045,6 +2266,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -2061,6 +2283,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -2077,6 +2300,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2093,6 +2317,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2109,6 +2334,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -2165,6 +2391,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -2186,6 +2413,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -2202,6 +2430,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -2218,6 +2447,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2234,6 +2464,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2250,6 +2481,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -2306,6 +2538,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -2327,6 +2560,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -2343,6 +2577,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -2359,6 +2594,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2375,6 +2611,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2391,6 +2628,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
index f36cb1d9782140..c56ba70b667d96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -29,6 +29,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_to_4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -40,24 +41,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_to_4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_to_4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_to_4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -91,6 +96,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_to_2
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -102,24 +108,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_2
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_to_2
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_to_2
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_to_2
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -153,6 +163,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_to_1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -164,24 +175,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_to_1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_to_1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_to_1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -208,33 +223,48 @@ body: |
     ; GFX6-LABEL: name: store_global_s64
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
-    ; GFX6-NEXT: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store (s64), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -263,30 +293,35 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s128
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s128
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s128
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s128
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s128
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -321,6 +356,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
@@ -332,24 +368,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -383,6 +423,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
@@ -394,24 +435,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v4s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -438,33 +483,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v2s16
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2
-    ; GFX6-NEXT: G_STORE [[COPY1]](<2 x s16>), [[COPY]](p1) :: (store (<2 x s16>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -491,33 +551,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v4s16
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX6-NEXT: G_STORE [[COPY1]](<4 x s16>), [[COPY]](p1) :: (store (<4 x s16>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v4s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -544,33 +619,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v8s16
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX6-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v8s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v8s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v8s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v8s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -597,33 +687,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v2s64
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX6-NEXT: G_STORE [[COPY1]](<2 x s64>), [[COPY]](p1) :: (store (<2 x s64>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -650,33 +755,48 @@ body: |
     ; GFX6-LABEL: name: store_global_p1
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3
-    ; GFX6-NEXT: G_STORE [[COPY1]](p1), [[COPY]](p1) :: (store (p1), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_p1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -706,30 +826,35 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2p1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -756,33 +881,48 @@ body: |
     ; GFX6-LABEL: name: store_global_p3
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6-NEXT: G_STORE [[COPY1]](p3), [[COPY]](p1) :: (store (p3), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_p3
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -812,30 +952,35 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX6-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2p3
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -869,6 +1014,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_atomic_global_s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -880,24 +1026,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_atomic_global_s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_atomic_global_s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_atomic_global_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_atomic_global_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -932,6 +1082,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_atomic_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
@@ -943,24 +1094,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_atomic_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_atomic_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_atomic_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_atomic_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -995,6 +1150,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -1006,6 +1162,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_gep_2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1022,6 +1179,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
@@ -1038,12 +1196,14 @@ body: |
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 7160f03d2c3d62..a5482bd5b79a96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -748,11 +748,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX7-NEXT:    s_nop 3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index a325288ca1601f..13f78852071051 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -174,24 +174,21 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) {
 ; GFX7-LABEL: test_div_scale_f64_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_1:
@@ -263,24 +260,21 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) {
 ; GFX7-LABEL: test_div_scale_f64_2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_2:
@@ -649,19 +643,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_num_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[8:9]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
@@ -724,19 +718,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_num_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[8:9], v[0:1], s[8:9]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
@@ -799,19 +793,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_den_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[8:9], s[8:9], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
@@ -874,19 +868,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_den_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[8:9], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
@@ -1071,9 +1065,9 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
@@ -1131,9 +1125,9 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
@@ -1644,14 +1638,14 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
 define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 {
 ; GFX7-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s3, 0x40200000
 ; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_val_undef_val:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 8506efc1d97862..043e69abaeef2d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -31,9 +31,9 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
   store i64 %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 469e6ddad0a070..c28d204784d4b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -4,8 +4,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
 
-; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
 
 define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -112,6 +111,56 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
+; GFX6-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
+; GFX6-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
+; GFX6-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5
+; GFX6-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7
+; GFX6-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6
+; GFX6-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9
+; GFX6-NEXT:    buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
+; GFX6-NEXT:    buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10
+; GFX6-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(11)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(10)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(6)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v3, v4, v12
+; GFX6-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT:    v_or_b32_e32 v6, v8, v9
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v4, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v6, v5
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
   ret <3 x i32> %load
 }
@@ -176,6 +225,32 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v7
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align2:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2
+; GFX6-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6
+; GFX6-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10
+; GFX6-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT:    buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
   ret <3 x i32> %load
 }
@@ -197,6 +272,20 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
   ret <3 x i32> %load
 }
@@ -218,6 +307,20 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_i96_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load i96, ptr addrspace(4) %ptr, align 8
   ret i96 %load
 }
@@ -239,6 +342,20 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
   ret <3 x i32> %load
 }
@@ -266,6 +383,23 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v6i16_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX6-NEXT:    v_mov_b32_e32 v0, v6
+; GFX6-NEXT:    v_mov_b32_e32 v2, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
   ret <6 x i16> %load
 }
@@ -313,6 +447,29 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v12i8_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
+; GFX6-NEXT:    v_mov_b32_e32 v0, v12
+; GFX6-NEXT:    v_mov_b32_e32 v4, v13
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
   ret <12 x i8> %load
 }
@@ -334,6 +491,16 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
   ret <3 x i32> %load
 }
@@ -451,6 +618,57 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX6-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:3
+; GFX6-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:2
+; GFX6-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0 offset:5
+; GFX6-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:7
+; GFX6-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:6
+; GFX6-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:9
+; GFX6-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:11
+; GFX6-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0 offset:10
+; GFX6-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0
+; GFX6-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(11)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(10)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(6)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v10
+; GFX6-NEXT:    v_or_b32_e32 v3, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v4, v6, v11
+; GFX6-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
   ret <3 x i32> %load
 }
@@ -523,6 +741,33 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align2:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX6-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
+; GFX6-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:10
+; GFX6-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
+; GFX6-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
   ret <3 x i32> %load
 }
@@ -545,6 +790,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
   ret <3 x i32> %load
 }
@@ -567,6 +821,15 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_i96_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load i96, ptr addrspace(4) %ptr, align 8
   ret i96 %load
 }
@@ -589,6 +852,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
   ret <3 x i32> %load
 }
@@ -611,6 +883,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v6i16_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
   %cast = bitcast <6 x i16> %load to <3 x i32>
   ret <3 x i32> %cast
@@ -652,6 +933,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s12
 ; GFX7-NEXT:    s_mov_b32 s4, s13
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v12i8_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b32 s1, s12, 8
+; GFX6-NEXT:    s_lshr_b32 s2, s12, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s12, 24
+; GFX6-NEXT:    s_lshr_b32 s5, s13, 8
+; GFX6-NEXT:    s_lshr_b32 s6, s13, 16
+; GFX6-NEXT:    s_lshr_b32 s7, s13, 24
+; GFX6-NEXT:    s_lshr_b32 s9, s8, 8
+; GFX6-NEXT:    s_lshr_b32 s10, s8, 16
+; GFX6-NEXT:    s_lshr_b32 s11, s8, 24
+; GFX6-NEXT:    s_mov_b32 s0, s12
+; GFX6-NEXT:    s_mov_b32 s4, s13
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
   ret <12 x i8> %load
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 50f143dc448c3e..7ad19a47970039 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -329,16 +329,13 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v4, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_or_b32_e32 v0, 0xff800000, v1
-; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[1:2], 3
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v5
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_or_b32_e32 v1, 0xff800000, v1
+; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v1
+; GFX7-NEXT:    v_lshl_b64 v[3:4], v[1:2], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX7-NEXT:    buffer_store_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: muli24_shl64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 9bc61895038871..c040c912b9ff79 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -1,5 +1,4 @@
-; FIXME: Need to add support for mubuf stores to enable this on SI.
-; XUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s
 ; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10  %s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
new file mode 100644
index 00000000000000..e04b2b71277d0b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -0,0 +1,557 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+
+define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
+; GFX9-LABEL: v_sub_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i16> %a, %b
+  ret <2 x i16> %sub
+}
+
+define <2 x i16> @v_sub_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
+; GFX9-LABEL: v_sub_v2i16_fneg_lhs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16_fneg_lhs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16_fneg_lhs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16_fneg_lhs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = fneg <2 x half> %a
+  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
+  %sub = sub <2 x i16> %cast.neg.a, %b
+  ret <2 x i16> %sub
+}
+
+define <2 x i16> @v_sub_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
+; GFX9-LABEL: v_sub_v2i16_fneg_rhs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16_fneg_rhs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16_fneg_rhs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16_fneg_rhs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.b = fneg <2 x half> %b
+  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
+  %sub = sub <2 x i16> %a, %cast.neg.b
+  ret <2 x i16> %sub
+}
+
+define <2 x i16> @v_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = fneg <2 x half> %a
+  %neg.b = fneg <2 x half> %b
+  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
+  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
+  %sub = sub <2 x i16> %cast.neg.a, %cast.neg.b
+  ret <2 x i16> %sub
+}
+
+define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
+; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_splat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc0ffc0
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffffffc0
+; GFX8-NEXT:    v_subrev_u16_e32 v1, 0xffc0, v0
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_splat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
+  ret <2 x i16> %sub
+}
+
+define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
+; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4ffc0
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 4
+; GFX8-NEXT:    v_subrev_u16_e32 v1, 0xffc0, v0
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_lo:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x4ffc0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_lo:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x4ffc0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i16> %a, <i16 -64, i16 4>
+  ret <2 x i16> %sub
+}
+
+define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
+; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc00004
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
+; GFX8-NEXT:    v_subrev_u16_e32 v2, 4, v0
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xffc00004
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_hi:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xffc00004
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i16> %a, <i16 4, i16 -64>
+  ret <2 x i16> %sub
+}
+
+define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
+; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_splat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, 0xffc0ffc0
+; GFX9-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, 0xffc0ffc0
+; GFX10-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_splat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, 0xffc0ffc0
+; GFX11-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
+; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, 0x4ffc0
+; GFX9-NEXT:    s_sub_i32 s1, s1, 4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
+; GFX8-NEXT:    s_sub_i32 s1, s1, 4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, 0x4ffc0
+; GFX10-NEXT:    s_sub_i32 s1, s1, 4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_lo:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, 0x4ffc0
+; GFX11-NEXT:    s_sub_i32 s1, s1, 4
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i16> %a, <i16 -64, i16 4>
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
+; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, 0xffc00004
+; GFX9-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, 4
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, 0xffc00004
+; GFX10-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_hi:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, 0xffc00004
+; GFX11-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i16> %a, <i16 4, i16 -64>
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GFX9-LABEL: s_sub_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s1, s2, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, s1
+; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, s1
+; GFX10-NEXT:    s_sub_i32 s1, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, s1
+; GFX11-NEXT:    s_sub_i32 s1, s2, s3
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i16> %a, %b
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
+; GFX9-LABEL: s_sub_v2i16_fneg_lhs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s1, s2, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16_fneg_lhs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, s1
+; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16_fneg_lhs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, s1
+; GFX10-NEXT:    s_sub_i32 s1, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16_fneg_lhs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, s1
+; GFX11-NEXT:    s_sub_i32 s1, s2, s3
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %neg.a = fneg <2 x half> %a
+  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
+  %sub = sub <2 x i16> %cast.neg.a, %b
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
+; GFX9-LABEL: s_sub_v2i16_fneg_rhs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s1, s2, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16_fneg_rhs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, s1
+; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16_fneg_rhs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, s1
+; GFX10-NEXT:    s_sub_i32 s1, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16_fneg_rhs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, s1
+; GFX11-NEXT:    s_sub_i32 s1, s2, s3
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %neg.b = fneg <2 x half> %b
+  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
+  %sub = sub <2 x i16> %a, %cast.neg.b
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX9-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s1, s2, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_sub_i32 s0, s0, s1
+; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX10-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    s_sub_i32 s0, s0, s1
+; GFX10-NEXT:    s_sub_i32 s1, s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11-NEXT:    s_sub_i32 s0, s0, s1
+; GFX11-NEXT:    s_sub_i32 s1, s2, s3
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %neg.a = fneg <2 x half> %a
+  %neg.b = fneg <2 x half> %b
+  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
+  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
+  %sub = sub <2 x i16> %cast.neg.a, %cast.neg.b
+  %cast = bitcast <2 x i16> %sub to i32
+  ret i32 %cast
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 1b8216f4aa2a61..c793f9ee682f8c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7149,7 +7149,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_movk_i32 s8, 0x11f
@@ -7269,7 +7269,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -7533,21 +7533,21 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_movk_i32 s6, 0xf001
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
 ; GFX6-NEXT:    s_movk_i32 s8, 0xfff
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
@@ -7647,7 +7647,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7834,7 +7834,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7954,7 +7954,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -8283,7 +8283,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8399,7 +8399,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -8589,14 +8589,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_sub_u32 s4, 0, s10
 ; GFX6-NEXT:    s_subb_u32 s5, 0, s11
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_add_u32 s2, s2, s12
@@ -8724,13 +8724,13 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s8
 ; GFX9-NEXT:    s_subb_u32 s1, 0, s9
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
+; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
@@ -8944,14 +8944,14 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_movk_i32 s6, 0xf001
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX6-NEXT:    s_movk_i32 s6, 0xf001
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9073,7 +9073,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
+; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9789,7 +9789,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -9903,7 +9903,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
@@ -10093,14 +10093,14 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_sub_u32 s4, 0, s8
 ; GFX6-NEXT:    s_subb_u32 s5, 0, s9
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_add_u32 s2, s2, s10
@@ -10226,13 +10226,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s8
 ; GFX9-NEXT:    s_subb_u32 s1, 0, s9
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
+; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index 857784282528bc..7209d160e6c8a7 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -47,7 +47,7 @@ body:             |
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
+  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   renamable $vgpr57 = COPY $vgpr9, implicit $exec
   ; GCN-NEXT:   renamable $vgpr56 = COPY $vgpr8, implicit $exec
@@ -62,15 +62,17 @@ body:             |
   ; GCN-NEXT:   renamable $sgpr16_sgpr17 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.1, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.2, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.2 + 4, addrspace 5)
   ; GCN-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0
-  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2, addrspace 5)
-  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1, addrspace 5)
+  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1 + 4, addrspace 5)
   ; GCN-NEXT:   renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
-  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1, addrspace 5)
-  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2 + 4, addrspace 5)
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 2184478635e0e3..cb1b664549c9a3 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -25,12 +25,12 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
 ; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
-; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GFX9-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GFX9-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
@@ -171,12 +171,12 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GFX9-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GFX9-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
@@ -312,12 +312,12 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v9
 ; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v10
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GFX9-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GFX9-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
@@ -454,12 +454,12 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GFX9-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GFX9-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
@@ -709,118 +709,118 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v11
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v3, v11, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v11
-; GFX9-NEXT:    v_xor_b32_e32 v3, v4, v11
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, 0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v9, v8, 0
-; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v12
-; GFX9-NEXT:    v_mul_hi_u32 v13, v8, v4
-; GFX9-NEXT:    v_add3_u32 v7, v5, v7, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v8, v7, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v5
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v4, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v12, v7, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v13, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v8, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v9, v12
-; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v13
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v9, v13, 0
-; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v7, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v13, v7, 0
-; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v4
-; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v12, v4, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v14, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
+; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
+; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
+; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
+; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
+; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
+; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
+; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
+; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
+; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v13, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v12, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
-; GFX9-NEXT:    v_xor_b32_e32 v8, v0, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v7, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
-; GFX9-NEXT:    v_mul_hi_u32 v9, v8, v4
-; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v4, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v5, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v9, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v10, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v5
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v4, 0
-; GFX9-NEXT:    v_add3_u32 v1, v1, v10, v9
-; GFX9-NEXT:    v_sub_u32_e32 v9, v6, v1
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v9, v2, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v0, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0
+; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v2
+; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v2
+; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
+; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v6
+; GFX9-NEXT:    v_sub_u32_e32 v6, v4, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v5, v0
+; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v11
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[6:7]
-; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7]
-; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v5, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v16, v14, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v12, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v16, v14, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v8, v2, s[4:5]
-; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v9, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v15, v13, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v9
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
+; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v2, v5
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9]
+; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v8, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v15, v13, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v6, v7, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v12, v2, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, v3, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v3, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v6
-; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v4, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v5, v6, s[8:9]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
@@ -884,12 +884,12 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GFX9-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GFX9-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index a9a6075516f6c3..50693a92bc92cb 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -1804,12 +1804,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CISI-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CISI-NEXT:    s_sub_u32 s0, 0, s2
 ; CISI-NEXT:    s_subb_u32 s1, 0, s3
-; CISI-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; CISI-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; CISI-NEXT:    v_rcp_f32_e32 v0, v0
 ; CISI-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CISI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CISI-NEXT:    v_trunc_f32_e32 v1, v1
-; CISI-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; CISI-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; CISI-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CISI-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CISI-NEXT:    v_mul_lo_u32 v2, s0, v1
@@ -1954,12 +1954,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; VI-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; VI-NEXT:    s_sub_u32 s8, 0, s2
 ; VI-NEXT:    s_subb_u32 s9, 0, s3
-; VI-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; VI-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; VI-NEXT:    v_rcp_f32_e32 v0, v0
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; VI-NEXT:    v_trunc_f32_e32 v1, v1
-; VI-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; VI-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v4, v1
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, s8, v4
@@ -2111,12 +2111,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s2
 ; GFX9-NEXT:    s_subb_u32 s1, 0, s3
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s10, v1
@@ -2279,12 +2279,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX1010-NEXT:    s_sub_u32 s9, 0, s2
 ; GFX1010-NEXT:    s_subb_u32 s10, 0, s3
-; GFX1010-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX1010-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1010-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1010-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX1010-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX1010-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1010-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX1010-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1010-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1010-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1010-NEXT:    v_readfirstlane_b32 s0, v1
@@ -2441,12 +2441,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX1030W32-NEXT:    s_sub_u32 s9, 0, s2
 ; GFX1030W32-NEXT:    s_subb_u32 s10, 0, s3
-; GFX1030W32-NEXT:    v_fmac_f32_e32 v0, 0x4f800000, v1
+; GFX1030W32-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
 ; GFX1030W32-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1030W32-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX1030W32-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX1030W32-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1030W32-NEXT:    v_fmac_f32_e32 v0, 0xcf800000, v1
+; GFX1030W32-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
 ; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1030W32-NEXT:    v_readfirstlane_b32 s0, v1
@@ -2603,12 +2603,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX1030W64-NEXT:    s_sub_u32 s9, 0, s2
 ; GFX1030W64-NEXT:    s_subb_u32 s10, 0, s3
-; GFX1030W64-NEXT:    v_fmac_f32_e32 v0, 0x4f800000, v1
+; GFX1030W64-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
 ; GFX1030W64-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1030W64-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX1030W64-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX1030W64-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1030W64-NEXT:    v_fmac_f32_e32 v0, 0xcf800000, v1
+; GFX1030W64-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
 ; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1030W64-NEXT:    v_readfirstlane_b32 s8, v1
@@ -2766,7 +2766,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_sub_u32 s9, 0, s2
 ; GFX11-NEXT:    s_subb_u32 s10, 0, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v0, 0x4f800000, v1
+; GFX11-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2774,7 +2774,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fmac_f32_e32 v0, 0xcf800000, v1
+; GFX11-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index fe649d43330417..9fb0cab068d286 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -7,6 +7,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0x3ca3d70a
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
@@ -36,34 +37,33 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GFX10-NEXT:    v_fma_f32 v1, v1, v5, s28
 ; GFX10-NEXT:    v_max_f32_e64 v6, s0, s0 clamp
 ; GFX10-NEXT:    v_add_f32_e64 v5, s29, -1.0
-; GFX10-NEXT:    v_sub_f32_e32 v8, s0, v1
-; GFX10-NEXT:    v_fma_f32 v7, -s2, v6, s6
+; GFX10-NEXT:    v_sub_f32_e32 v9, s0, v1
+; GFX10-NEXT:    v_fma_f32 v8, -s2, v6, s6
 ; GFX10-NEXT:    v_fma_f32 v5, v6, v5, 1.0
-; GFX10-NEXT:    v_mad_f32 v10, s2, v6, v2
-; GFX10-NEXT:    s_mov_b32 s0, 0x3c23d70a
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v6, v8
-; GFX10-NEXT:    v_fmac_f32_e32 v10, v7, v6
+; GFX10-NEXT:    v_mad_f32 v11, s2, v6, v2
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v6, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v8, v6
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mul_f32_e32 v9, s10, v0
+; GFX10-NEXT:    v_mul_f32_e32 v10, s10, v0
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, s10, s14
-; GFX10-NEXT:    v_mul_f32_e32 v8, s18, v2
+; GFX10-NEXT:    v_mul_f32_e32 v9, s18, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v3, s22, v3
-; GFX10-NEXT:    v_fmac_f32_e32 v9, v0, v6
+; GFX10-NEXT:    v_fmac_f32_e32 v10, v0, v6
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v1, v5
-; GFX10-NEXT:    v_mul_f32_e32 v1, v8, v6
-; GFX10-NEXT:    v_mul_f32_e32 v7, v6, v3
-; GFX10-NEXT:    v_fma_f32 v3, -v6, v3, v9
+; GFX10-NEXT:    v_mul_f32_e32 v1, v9, v6
+; GFX10-NEXT:    v_mul_f32_e32 v8, v6, v3
+; GFX10-NEXT:    v_fma_f32 v3, -v6, v3, v10
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v0, v6
 ; GFX10-NEXT:    v_fma_f32 v0, v2, s26, -v1
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v3, v6
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v3, v6
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v0, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v10
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v6
-; GFX10-NEXT:    v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
+; GFX10-NEXT:    v_fmamk_f32 v4, v5, 0x3c23d70a, v7
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v4
+; GFX10-NEXT:    v_mul_f32_e32 v2, v8, v4
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -71,7 +71,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GFX11-LABEL: _amdgpu_ps_main:
 ; GFX11:       ; %bb.0: ; %.entry
 ; GFX11-NEXT:    image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0x3ca3d70a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
@@ -96,43 +96,40 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GFX11-NEXT:    s_buffer_load_b128 s[20:23], s[0:3], 0x70
 ; GFX11-NEXT:    v_fma_f32 v1, v1, v5, s28
 ; GFX11-NEXT:    v_max_f32_e64 v6, s0, s0 clamp
-; GFX11-NEXT:    s_buffer_load_b128 s[24:27], s[0:3], 0x10
 ; GFX11-NEXT:    v_add_f32_e64 v5, s29, -1.0
+; GFX11-NEXT:    s_buffer_load_b128 s[24:27], s[0:3], 0x10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_f32_e32 v8, s0, v1
-; GFX11-NEXT:    v_fma_f32 v7, -s2, v6, s6
-; GFX11-NEXT:    v_fma_f32 v10, s2, v6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_f32_e32 v9, s0, v1
+; GFX11-NEXT:    v_fma_f32 v8, -s2, v6, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_fma_f32 v5, v6, v5, 1.0
-; GFX11-NEXT:    s_mov_b32 s0, 0x3c23d70a
+; GFX11-NEXT:    v_fma_f32 v11, s2, v6, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f32_e32 v9, s10, v0
+; GFX11-NEXT:    v_mul_f32_e32 v10, s10, v0
 ; GFX11-NEXT:    v_fma_f32 v0, -v0, s10, s14
-; GFX11-NEXT:    v_mul_f32_e32 v3, s22, v3
-; GFX11-NEXT:    v_dual_fmac_f32 v1, v6, v8 :: v_dual_mul_f32 v8, s18, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_fmac_f32_e32 v9, v0, v6
-; GFX11-NEXT:    v_dual_fmac_f32 v10, v7, v6 :: v_dual_mul_f32 v7, v6, v3
+; GFX11-NEXT:    v_fmac_f32_e32 v1, v6, v9
+; GFX11-NEXT:    v_mul_f32_e32 v9, s18, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fmac_f32_e32 v10, v0, v6
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v5
-; GFX11-NEXT:    v_fma_f32 v3, -v6, v3, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v3, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v5, v0, v6
-; GFX11-NEXT:    v_mul_f32_e32 v1, v8, v6
+; GFX11-NEXT:    v_mul_f32_e32 v3, s22, v3
+; GFX11-NEXT:    v_dual_fmac_f32 v11, v8, v6 :: v_dual_mul_f32 v8, v6, v3
+; GFX11-NEXT:    v_mul_f32_e32 v1, v9, v6
+; GFX11-NEXT:    v_fma_f32 v3, -v6, v3, v10
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_f32_e32 v4, v4, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v3, v4, v6 :: v_dual_fmaak_f32 v4, s0, v5, 0x3ca3d70a
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v4, v4, v11
 ; GFX11-NEXT:    v_fma_f32 v0, v2, s26, -v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v1, v0, v6
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v6
-; GFX11-NEXT:    v_mul_f32_e32 v2, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v3, v6
+; GFX11-NEXT:    v_dual_mul_f32 v3, v4, v6 :: v_dual_fmamk_f32 v4, v5, 0x3c23d70a, v7
+; GFX11-NEXT:    v_dual_mul_f32 v1, v3, v1 :: v_dual_mul_f32 v2, v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v1, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v0, 0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
 .entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 36fa95c4c3ab5b..ff02c6ba698a3a 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -125,7 +125,62 @@ body:               |
   %2:sreg_32 = S_MOV_B32 1
   %3:sreg_32 = COPY %1:vgpr_32
   %4:sreg_32 = S_CSELECT_B32 killed %2:sreg_32, killed %3:sreg_32, implicit undef $scc
+...
+
+---
+# Test that the VGPR immediate is replaced with an SGPR one.
+# GCN-LABEL: name: reg_sequence_vgpr_immediate
+# GCN: [[A_SGPR:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+# GCN-NEXT: [[VGPR_CONST:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 37
+# GCN-NEXT: [[SGPR_CONST:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
+# GCN-NEXT: {{%[0-9]+}}:sreg_64 = REG_SEQUENCE [[SGPR_CONST]], %subreg.sub0, [[A_SGPR]], %subreg.sub1
+name: reg_sequence_vgpr_immediate
+body:             |
+  bb.0:
+    %0:sreg_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_MOV_B32_e32 37, implicit $exec
+    %2:sreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %0:sreg_32, %subreg.sub1
+
+    %3:vgpr_32 = V_ADD_U32_e32 %1:vgpr_32, %1:vgpr_32, implicit $exec
+...
+
+---
+# GCN-LABEL: name: insert_subreg_vgpr_immediate
+# GCN: [[DST:%[0-9]+]]:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr0, %subreg.sub2
+# GCN-NEXT: [[SGPR_CONST:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
+# GCN-NEXT: {{%[0-9]+}}:sgpr_128 = INSERT_SUBREG [[DST]], [[SGPR_CONST]], %subreg.sub3
+name: insert_subreg_vgpr_immediate
+body:             |
+  bb.0:
+    %0:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr0, %subreg.sub2
+    %1:vgpr_32 = V_MOV_B32_e32 43, implicit $exec
+    %2:sgpr_128 = INSERT_SUBREG %0, %1, %subreg.sub3
+...
+
 ---
+# GCN-LABEL: name: phi_vgpr_immediate
+# GCN: bb.1:
+# GCN: [[SGPR:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
+# GCN: bb.2:
+# GCN: IMPLICIT_DEF
+# GCN: bb.3:
+# GCN: sreg_32 = PHI [[SGPR]], %bb.1
+name: phi_vgpr_immediate
+tracksRegLiveness: true
+body:               |
+  bb.0:
+    S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+
+  bb.1:
+    %0:vgpr_32 = V_MOV_B32_e32 51, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    %1:sreg_32 = IMPLICIT_DEF
+    S_BRANCH %bb.3
+
+  bb.3:
+    %2:sreg_32 = PHI %0:vgpr_32, %bb.1, %1:sreg_32, %bb.2
 
 ---
 name:            cmp_f32
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 23971f2b681cb0..e8423ce9fbc36a 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 
 declare half @llvm.fma.f16(half, half, half)
 declare half @llvm.maxnum.f16(half, half)
@@ -19,6 +21,12 @@ define half @test_fma(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fma:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half %z)
   ret half %r
 }
@@ -36,6 +44,12 @@ define half @test_fmac(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmac_f16_e32 v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fmac:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fmac_f16_e32 v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %y, half %z, half %x)
   ret half %r
 }
@@ -61,6 +75,12 @@ define half @test_fmaak(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fmaak:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
   ret half %r
 }
@@ -86,6 +106,12 @@ define half @test_fmamk(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fmamk:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
   ret half %r
 }
@@ -139,6 +165,33 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_D139469_f16:
+; GFX11-SDAG:       ; %bb.0: ; %bb
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX11-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-SDAG-NEXT:    v_min_f16_e32 v0, v2, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_D139469_f16:
+; GFX11-GISEL:       ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x291e
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
+; GFX11-GISEL-NEXT:    v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract half %arg, 0xH291E
   %i1 = fcmp olt half %i, 0xH0000
@@ -213,6 +266,44 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, s6, s5
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_D139469_v2f16:
+; GFX11-SDAG:       ; %bb.0: ; %bb
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x211e
+; GFX11-SDAG-NEXT:    v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_D139469_v2f16:
+; GFX11-GISEL:       ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0x291e291e
+; GFX11-GISEL-NEXT:    v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0]
+; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v0, s0, 0x211e op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s1, 0, v2
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s2, 0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_or_b32 s0, s1, s2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
   %i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 40ebb191802a89..70d915df7cb007 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; TODO: LLVM ERROR: cannot select: G_STORE %30:vgpr(s64), %22:vgpr(p1)
-; RUN: not --crash llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s
+; RUN: llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
@@ -10,22 +9,39 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL %s
 
 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -104,22 +120,39 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -199,22 +232,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -294,22 +344,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -389,23 +456,41 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
-; SI-NEXT:    v_min_f32_e32 v2, 2.0, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, 2.0, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, 2.0, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -488,26 +573,47 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_max_f32_e32 v2, 2.0, v2
-; SI-NEXT:    v_min_f32_e32 v3, 4.0, v2
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, 2.0, v2
+; SI-SDAG-NEXT:    v_min_f32_e32 v3, 4.0, v2
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v3, 2.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -639,23 +745,41 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 }
 
 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_r_i_i_f64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; SI-NEXT:    v_max_f64 v[2:3], v[2:3], 2.0
-; SI-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], 2.0
+; SI-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; SI-SDAG-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], 2.0
+; SI-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
 ; VI-SDAG:       ; %bb.0:
@@ -739,21 +863,37 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
 }
 
 define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
-; SI-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -827,22 +967,40 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_legacy_fmed3_r_i_i_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_max_legacy_f32_e64 v2, v2, 2.0
+; SI-GISEL-NEXT:    v_min_legacy_f32_e64 v2, v2, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -963,29 +1121,52 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
 ; VI-SDAG:       ; %bb.0:
@@ -1130,29 +1311,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, -v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, -v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, -1.0, v3
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
 ; VI-SDAG:       ; %bb.0:
@@ -1297,29 +1501,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, -v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, -v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, -1.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
 ; VI-SDAG:       ; %bb.0:
@@ -1464,29 +1691,53 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, |v3|, -|v4|
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, |v3|, -|v4|
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e64 v4, -1.0, |v4|
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, |v3|, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
 ; VI-SDAG:       ; %bb.0:
@@ -1640,29 +1891,54 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -|v2|, -|v3|, -|v4|
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -|v2|, -|v3|, -|v4|
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e64 v2, -1.0, |v2|
+; SI-GISEL-NEXT:    v_mul_f32_e64 v3, -1.0, |v3|
+; SI-GISEL-NEXT:    v_mul_f32_e64 v4, -1.0, |v4|
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
 ; VI-SDAG:       ; %bb.0:
@@ -1821,32 +2097,57 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 }
 
 define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -1971,29 +2272,51 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
 }
 
 define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_input_calls_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2101,29 +2424,51 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
 }
 
 define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_call_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2231,29 +2576,51 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_fast_call_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2373,29 +2740,51 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; + commute outermost max
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2503,29 +2892,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat1:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
 ; VI-SDAG:       ; %bb.0:
@@ -2633,29 +3044,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
 ; VI-SDAG:       ; %bb.0:
@@ -2800,29 +3234,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
 ; VI-SDAG:       ; %bb.0:
@@ -2930,29 +3386,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat3:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
 ; VI-SDAG:       ; %bb.0:
@@ -3060,29 +3538,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat4:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
 ; VI-SDAG:       ; %bb.0:
@@ -3190,29 +3690,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat5:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
 ; VI-SDAG:       ; %bb.0:
@@ -3320,29 +3842,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat6:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
 ; VI-SDAG:       ; %bb.0:
@@ -3450,29 +3994,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat7:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
 ; VI-SDAG:       ; %bb.0:
@@ -3580,29 +4146,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat8:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
 ; VI-SDAG:       ; %bb.0:
@@ -3710,29 +4298,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat9:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
 ; VI-SDAG:       ; %bb.0:
@@ -3840,29 +4450,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat10:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
 ; VI-SDAG:       ; %bb.0:
@@ -3970,29 +4602,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat11:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
 ; VI-SDAG:       ; %bb.0:
@@ -4100,29 +4754,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat12:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
 ; VI-SDAG:       ; %bb.0:
@@ -4230,29 +4906,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat13:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
 ; VI-SDAG:       ; %bb.0:
@@ -4360,29 +5058,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat14:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
 ; VI-SDAG:       ; %bb.0:
@@ -4490,29 +5210,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat15:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
 ; VI-SDAG:       ; %bb.0:
@@ -4623,29 +5365,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
 ; min(max(x, y), max(min(x, y), z))
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
 ; VI-SDAG:       ; %bb.0:
@@ -4757,38 +5521,70 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ; ---------------------------------------------------------------------
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    buffer_store_dword v5, off, s[8:11], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s10, -1
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    buffer_store_dword v5, off, s[8:11], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_mov_b32 s3, s11
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
 ; VI-SDAG:       ; %bb.0:
@@ -4927,38 +5723,70 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s10, -1
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_mov_b32 s3, s11
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
 ; VI-SDAG:       ; %bb.0:
@@ -5121,38 +5949,70 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s10, -1
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_mov_b32 s3, s11
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
 ; VI-SDAG:       ; %bb.0:
@@ -5291,34 +6151,62 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v2, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5469,32 +6357,57 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5619,32 +6532,57 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5769,32 +6707,57 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5919,29 +6882,52 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; VI-SDAG:       ; %bb.0:
@@ -6086,31 +7072,57 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_min_f32_e64 v5, -v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_min_f32_e64 v5, -v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, -1.0, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v5, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
 ; VI-SDAG:       ; %bb.0:
@@ -6272,30 +7284,53 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
 
 ; A simple min and max is not sufficient
 define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_min_max_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -6404,24 +7439,51 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
 }
 
 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 1.0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, 2.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 4.0
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; VI-SDAG:       ; %bb.0:
@@ -6502,36 +7564,83 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_med3_f16_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 2.0
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v5, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -6663,23 +7772,41 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 }
 
 define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: two_non_inline_constant:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 0.5, v2
-; SI-NEXT:    v_max_f32_e32 v2, 0x41000000, v2
-; SI-NEXT:    v_min_f32_e32 v2, 0x41800000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: two_non_inline_constant:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 0.5, v2
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, 0x41000000, v2
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, 0x41800000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: two_non_inline_constant:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0.5, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, 0x41000000, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, 0x41800000, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: two_non_inline_constant:
 ; VI-SDAG:       ; %bb.0:
@@ -6779,27 +7906,49 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 
 ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
 define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: one_non_inline_constant:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v4, 0.5, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
-; SI-NEXT:    v_med3_f32 v3, v4, 1.0, v3
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: one_non_inline_constant:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 0.5, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
+; SI-SDAG-NEXT:    v_med3_f32 v3, v4, 1.0, v3
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: one_non_inline_constant:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0x41800000
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 0.5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
+; SI-GISEL-NEXT:    v_med3_f32 v3, v3, 1.0, s4
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: one_non_inline_constant:
 ; VI-SDAG:       ; %bb.0:
@@ -6912,31 +8061,57 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 }
 
 define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: two_non_inline_constant_multi_use:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s4, 0x41000000
-; SI-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v4, 0.5, v2
-; SI-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
-; SI-NEXT:    v_med3_f32 v3, v4, s4, v3
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_store_dword v5, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: two_non_inline_constant_multi_use:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x41000000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 0.5, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
+; SI-SDAG-NEXT:    v_med3_f32 v3, v4, s4, v3
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    buffer_store_dword v5, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: two_non_inline_constant_multi_use:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0x41000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
+; SI-GISEL-NEXT:    v_med3_f32 v3, v4, s4, v3
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    buffer_store_dword v5, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: two_non_inline_constant_multi_use:
 ; VI-SDAG:       ; %bb.0:
@@ -7085,3 +8260,5 @@ declare half @llvm.maxnum.f16(half, half) #0
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
 attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index a8709c8a9a7c0f..c5a430823ecf33 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=0 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
-; TODO: Crashes on selecting G_STORE.
-; RUN: not --crash llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
@@ -28,6 +27,18 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -150,6 +161,19 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f64_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -282,6 +306,21 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s4
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s5
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -311,10 +350,10 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT:    v_or_b32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_mov_b32 s2, -1
+; VI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
@@ -345,9 +384,10 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
@@ -381,10 +421,11 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -421,6 +462,23 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -448,15 +506,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-GISEL-NEXT:    s_mov_b32 s2, -1
+; VI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; VI-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT:    v_or_b32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
@@ -486,14 +544,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
@@ -528,6 +587,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
@@ -536,8 +597,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -569,6 +629,18 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s3
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -691,6 +763,18 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s3|
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -813,6 +897,18 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s3|
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -936,6 +1032,18 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -1063,6 +1171,18 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s3|
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -1192,6 +1312,19 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index 31bc2b04dde8c1..d8736c5276a26d 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -53,7 +53,7 @@ body:             |
     ; GCN-LABEL: name: func_add_constant_to_fi_uniform_i32
     ; GCN: liveins: $sgpr30_sgpr31
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $sgpr0 = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
+    ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
     ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_I32 killed $sgpr0, 4, implicit-def dead $scc
     ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
     ; GCN-NEXT: $m0 = S_MOV_B32 -1
@@ -89,7 +89,7 @@ body:             |
     ; GCN-LABEL: name: func_add_constant_to_fi_uniform_SCC_clobber_i32
     ; GCN: liveins: $sgpr30_sgpr31
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $sgpr0 = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
+    ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
     ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 killed $sgpr0, 4, implicit-def $scc
     ; GCN-NEXT: renamable $sgpr5 = S_ADDC_U32 $sgpr4, 1234567, implicit-def $scc, implicit $scc
     ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index ccde5efce08dc8..56f72ac9d9e8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2823,18 +2823,18 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0001
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2843,21 +2843,20 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0001
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0001
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                           ptr addrspace(1) %src2,
                                           ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index ea22aaee761c8d..5c44ba008df04e 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
@@ -117,16 +118,36 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
-; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -169,8 +190,6 @@ entry:
   ret void
 }
 
-; TODO: Currently, vector elements{0 and 3} get zero_extended from i16 to i32 which should
-; be sign_extended directly to i32; prevents the pattern recognizer to recognize this pattern.
 define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc16:
 ; GFX7:       ; %bb.0: ; %entry
@@ -294,33 +313,14 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_bfe_i32 v8, v8, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_sshort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
-; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc16:
@@ -329,35 +329,34 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX10-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_sshort v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
-; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
-; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v4, v2, v3
+; GFX10-DL-NEXT:    global_store_short v1, v4, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_i16 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -499,25 +498,14 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc8:
@@ -532,21 +520,28 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -692,14 +687,9 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
-; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -717,17 +707,36 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 8
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_multiuse_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, v3, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -772,7 +781,6 @@ entry:
   ret void
 }
 
-; TODO: Support this pattern.
 define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
@@ -879,17 +887,8 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_add3_u32 v2, v5, s0, v3
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s0
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -898,25 +897,36 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1137,6 +1147,53 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 8, v1
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1165,4 +1222,2215 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_2ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v3, s4
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_2ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_2ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_2ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_2ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_2ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  store i32 %add2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v4, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele_permuted:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 24, v2
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 24, v0
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele_permuted:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 24, v3
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v3, 24, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v4, 24, v2
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020003
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020003
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020003
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_opt:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX7-NEXT:    v_mul_i32_i24_e32 v3, v3, v6
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_opt:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v7, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX8-NEXT:    v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_i32 v8, v2, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v4, v4, v5, v6
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_i32_i24 v4, v7, v8, v4
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v2, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_opt:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_opt:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_opt:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_opt:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = sext i8 %v1e3 to i32
+  %v2e3 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e3 = sext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+  %add2 = add i32 %mul1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v6, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v6, v0, 16, 8
+; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX8-NEXT:    v_mad_i32_i24 v1, v5, v6, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v3, v1, s1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v1, v3, v0
+; GFX10-DL-NEXT:    global_store_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0x706010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = sext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = sext i8 %v1e3 to i32
+  %v3e3 = extractelement <4 x i8> %vec3, i64 3
+  %cv3e3 = sext i8 %v3e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3src_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3src_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0xc020100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = sext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_bad_source:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_sext_i32_i16 s5, s12
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, s5, v1
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_bad_source:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, s2, v1
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_bad_source:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v4, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_bad_source:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0201
+; GFX9-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s4
+; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v4, s2, v3
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, v3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_bad_source:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0201
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, s2, s3
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v0, v1, v2
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_bad_source:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX11-DL-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0201
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, s2, s3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       i16 %badsource,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %other = sext i16 %badsource to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %other
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_commutative:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v4, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_commutative:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_commutative:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_commutative:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_commutative:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_commutative:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = sext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v4, v2, v2, s0
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v2, v4
+; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v1, v4, 8, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v4, 16, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v4, v1, v1, s0
+; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 8, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c01
+; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0xc020101
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v2, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c01
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020101
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e0 = sext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = sext i8 %v3e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = sext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_4src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v5, v3, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v2, v4, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
+; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_4src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s10, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v3, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_i32 v6, v5, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v5, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v5, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, v7, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_4src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_4src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0501
+; GFX9-DL-NEXT:    s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0400
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s5, 0x4000c0c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT:    v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT:    v_perm_b32 v2, v4, v3, s5
+; GFX9-DL-NEXT:    v_or_b32_e32 v3, v6, v5
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v3, s6
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_4src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x3
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0501
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v5, v4, v3, 0x5010c0c
+; GFX10-DL-NEXT:    v_perm_b32 v2, v4, v3, 0x4000c0c
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    v_dot4c_i32_i8_e32 v2, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_4src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x3
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v3, v0, s[8:9]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[10:11]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v1, 0xc0c0501
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v0, v3, 0x5010c0c
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v3, 0x4000c0c
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX11-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) %src4,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+  %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx
+  %vec4 = load <4 x i8>, ptr addrspace(1) %gep4
+
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = sext i8 %v1e0 to i32
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = sext i8 %v1e1 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = sext i8 %v2e0 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = sext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1
+
+  %v3e0 = extractelement <4 x i8> %vec3, i64 0
+  %cv3e0 = sext i8 %v3e0 to i32
+  %v3e1 = extractelement <4 x i8> %vec3, i64 1
+  %cv3e1 = sext i8 %v3e1 to i32
+  %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1
+
+  %v4e0 = extractelement <4 x i8> %vec4, i64 0
+  %cv4e0 = sext i8 %v4e0 to i32
+  %v4e1 = extractelement <4 x i8> %vec4, i64 1
+  %cv4e1 = sext i8 %v4e1 to i32
+  %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  %mad4 = add i32 %mad3, %mul4
+
+  store i32 %mad4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_nonstandard_signed:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_mul_u32_u24_e32 v1, v1, v5
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_nonstandard_signed:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_mad_u16 v6, v8, v7, v6
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX8-NEXT:    v_mad_u16 v4, v4, v5, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
+; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_nonstandard_signed:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_nonstandard_signed:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX9-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX9-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_nonstandard_signed:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v6, 0xff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v3
+; GFX10-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_bfe_i32 v6, v7, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v3, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v6, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v1, v2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_nonstandard_signed:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_mul_lo_u16 v2, v2, v3
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mad_u16 v2, v4, v3, v2
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v2, v6, v5, v2
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %v1e0e = sext i8 %v1e0 to i16
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %v2e0e = zext i8 %v2e0 to i16
+  %mul0 = mul nsw i16 %v1e0e, %v2e0e
+  %add0 = add i16 %mul0, 0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %v1e1e = sext i8 %v1e1 to i16
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %v2e1e = zext i8 %v2e1 to i16
+  %mul1 = mul nsw i16 %v2e1e, %v1e1e
+  %add1 = add i16 %mul1, %add0
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %v1e2e = sext i8 %v1e2 to i16
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %v2e2e = zext i8 %v2e2 to i16
+  %mul2 = mul nsw i16 %v2e2e, %v1e2e
+  %add2 = add i16 %mul2, %add1
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %v1e3e = sext i8 %v1e3 to i16
+  %v2e3 = extractelement <4 x i8> %vec2, i64 3
+  %v2e3e = zext i8 %v2e3 to i16
+  %mul3 = mul nsw i16 %v1e3e, %v2e3e
+  %add3 = add i16 %mul3, %add2
+  %res = sext i16 %add3 to i32
+  store i32 %res, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index b7821f8fd6da51..a82c5215f3b2c6 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32:
@@ -127,6 +128,24 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -279,30 +298,14 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
-; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc16:
@@ -311,32 +314,34 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
-; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
-; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -479,25 +484,14 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc8:
@@ -512,21 +506,28 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -562,7 +563,6 @@ entry:
   ret void
 }
 
-; TODO: Generate udot4?
 define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_8:
 ; GFX7:       ; %bb.0: ; %entry
@@ -644,19 +644,19 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0100
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
-; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_8:
@@ -665,21 +665,44 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0100
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v2, v2, v3, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
-; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot2_8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                    ptr addrspace(1) %src2,
                                    ptr addrspace(1) nocapture %dst) {
 entry:
@@ -803,25 +826,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
@@ -836,21 +848,28 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v4, v3, v2, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v7, v6, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_CommutationInsideMAD:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -886,7 +905,6 @@ entry:
   ret void
 }
 
-; TODO: Support commutation accross the adds.
 define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_CommutationAccrossMADs:
 ; GFX7:       ; %bb.0: ; %entry
@@ -986,25 +1004,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
-; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
+; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
@@ -1019,21 +1026,28 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v4, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_CommutationAccrossMADs:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                         ptr addrspace(1) %src2,
                                                         ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1180,14 +1194,9 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
-; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1205,17 +1214,36 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v0, v3
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_multiuse_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, v3, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1369,18 +1397,12 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 8, 8
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 8, 8
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
-; GFX9-DL-NEXT:    v_add_u32_e32 v4, s0, v2
-; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v3, v6
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v1, v4
+; GFX9-DL-NEXT:    s_add_i32 s1, s0, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
+; GFX9-DL-NEXT:    v_add3_u32 v1, s1, v3, v1
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1394,21 +1416,41 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 8, 8
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_bfe_u32 v3, v2, 8, 8
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s2, v0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v4, v3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
-; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
+; GFX10-DL-NEXT:    s_add_i32 s2, s2, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_add3_u32 v0, s2, v0, v1
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_multiuse_add1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_u32 v2, v1, 8, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_bfe_u32 v3, v0, 8, 8
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add3_u32 v0, s2, v2, v0
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1564,7 +1606,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0302
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
@@ -1580,13 +1622,10 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_bfe_i32 v5, v2, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
-; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1596,7 +1635,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v7, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
@@ -1608,20 +1646,52 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v8, v2, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0302
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v8, v3
-; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: notdot4_mixedtypes:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0302
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1665,7 +1735,7 @@ entry:
   ret void
 }
 
-; TODO: cleanup s_lshr_b32 and support this pattern.
+; TODO: cleanup s_lshr_b32
 define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
@@ -1767,14 +1837,8 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_add3_u32 v2, v3, s0, v4
-; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1787,22 +1851,30 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
+; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2016,6 +2088,51 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_lshrrev_b16 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v5, 8, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v9
+; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2208,6 +2325,53 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc8_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b16 v8, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b16 v9, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-NEXT:    v_mul_lo_u16 v5, v5, v6
+; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v4, v7
+; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v8, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v6, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v7, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2233,4 +2397,2229 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_2ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s4
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_2ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_2ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_2ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_2ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_2ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  store i32 %add2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_3ele_permuted:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_3ele_permuted:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020003
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020003
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020003
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020003
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul2
+  %add3 = add i32 %add2, %mul3
+  store i32 %add3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_opt:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_opt:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v2
+; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_u32_u24 v4, v7, v8, v4
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_opt:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_opt:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_opt:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
+; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_opt:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <4 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+  %add2 = add i32 %mul1, %mul2
+  %add3 = add i32 %add2, %mul3
+  %add4 = add i32 %add3, %mul4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_acc32_3src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_3src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_3src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_3src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v3, v1, s1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_3src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v0, s0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_3src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0x706010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = zext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
+
+  %v1e3 = extractelement <4 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v3e3 = extractelement <4 x i8> %vec3, i64 3
+  %cv3e3 = zext i8 %v3e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  %mad4 = add i32 %mad3, %mul4
+
+  store i32 %mad4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_acc32_3src_3ele:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_3src_3ele:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = zext i8 %v3e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_bad_source:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_and_b32 s5, s12, 0xffff
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, s5, v1
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_bad_source:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, s2, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_bad_source:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v4, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_bad_source:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0201
+; GFX9-DL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, s2, v3
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_bad_source:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0201
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, s2, s3
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v2, v0
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_bad_source:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-DL-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0201
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, s2, s3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       i16 %badsource,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %other = zext i16 %badsource to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %other
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_commutative:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_commutative:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_commutative:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_commutative:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_commutative:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_commutative:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x3c
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %v1e2 = extractelement <4 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v4, v2, v2, s0
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_bfe_u32 v1, v4, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v3, v4, 16, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v4, v1, v1, s0
+; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x2
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c01
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020101
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x2
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v2, 0xc06010c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c01
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020101
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0
+
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v3e2 = extractelement <4 x i8> %vec3, i64 2
+  %cv3e2 = zext i8 %v3e2 to i32
+  %v2e2 = extractelement <4 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+
+  store i32 %mad3, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_4src:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, s3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x11
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v3
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_4src:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s10, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v5
+; GFX8-NEXT:    v_bfe_u32 v5, v5, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v5, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v0
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v2, v7, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_4src:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_4src:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0501
+; GFX9-DL-NEXT:    s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX9-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0400
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s5, 0x4000c0c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT:    v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT:    v_perm_b32 v2, v4, v3, s5
+; GFX9-DL-NEXT:    v_or_b32_e32 v3, v6, v5
+; GFX9-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v3, s6
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_4src:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x3
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
+; GFX10-DL-NEXT:    global_load_dword v4, v0, s[10:11]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0501
+; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v5, v4, v3, 0x5010c0c
+; GFX10-DL-NEXT:    v_perm_b32 v2, v4, v3, 0x4000c0c
+; GFX10-DL-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_4src:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x3
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_b32 v3, v0, s[8:9]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[10:11]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v1, 0xc0c0501
+; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v0, v3, 0x5010c0c
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v3, 0x4000c0c
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX11-DL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, s2
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) %src3,
+                                       ptr addrspace(1) %src4,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
+  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
+  %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx
+  %vec4 = load <4 x i8>, ptr addrspace(1) %gep4
+
+
+  %v1e0 = extractelement <4 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v1e1 = extractelement <4 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1
+
+  %v2e0 = extractelement <4 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %v2e1 = extractelement <4 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1
+
+  %v3e0 = extractelement <4 x i8> %vec3, i64 0
+  %cv3e0 = zext i8 %v3e0 to i32
+  %v3e1 = extractelement <4 x i8> %vec3, i64 1
+  %cv3e1 = zext i8 %v3e1 to i32
+  %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1
+
+  %v4e0 = extractelement <4 x i8> %vec4, i64 0
+  %cv4e0 = zext i8 %v4e0 to i32
+  %v4e1 = extractelement <4 x i8> %vec4, i64 1
+  %cv4e1 = zext i8 %v4e1 to i32
+  %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1
+
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad1 = add i32 %mul1, %acc
+  %mad2 = add i32 %mad1, %mul2
+  %mad3 = add i32 %mad2, %mul3
+  %mad4 = add i32 %mad3, %mul4
+
+  store i32 %mad4, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
+; GFX7-LABEL: udot4_acc32_multi:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v3
+; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v8, v1
+; GFX7-NEXT:    v_bfe_u32 v11, v3, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v4, v1
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v11, v8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v2, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_mad_u32_u24 v1, v10, v6, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot4_acc32_multi:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v3, v3, v4, s2
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v1
+; GFX8-NEXT:    v_mad_u32_u24 v3, v7, v8, v3
+; GFX8-NEXT:    v_bfe_u32 v11, v1, 16, 8
+; GFX8-NEXT:    v_mad_u32_u24 v3, v9, v4, v3
+; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v3, v11, v8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v6, v3
+; GFX8-NEXT:    v_bfe_u32 v10, v1, 8, 8
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v10, v6, v0
+; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: udot4_acc32_multi:
+; GFX9-NODL:       ; %bb.0: ; %entry
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    global_load_dword v3, v2, s[6:7]
+; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX9-NODL-NEXT:    v_bfe_u32 v6, v3, 16, 8
+; GFX9-NODL-NEXT:    v_bfe_u32 v5, v3, 8, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v7, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v9, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_add3_u32 v3, v7, s0, v9
+; GFX9-NODL-NEXT:    v_add3_u32 v3, v3, v4, v6
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v8, v3, v0
+; GFX9-NODL-NEXT:    v_add3_u32 v0, v0, v5, v1
+; GFX9-NODL-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot4_acc32_multi:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-DL-NEXT:    s_mov_b32 s0, 0x6040200
+; GFX9-DL-NEXT:    s_mov_b32 s1, 0x2000200
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v2, s[6:7]
+; GFX9-DL-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0x7050301
+; GFX9-DL-NEXT:    s_mov_b32 s6, 0x3010301
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v4, v1, v0, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v5, v3, v3, s1
+; GFX9-DL-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v4, v5, s5
+; GFX9-DL-NEXT:    v_perm_b32 v3, v3, v3, s6
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v0, v3, v1
+; GFX9-DL-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_multi:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v2, s[6:7]
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v2, v1, v0, 0x6040200
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v4, v3, v3, 0x2000200
+; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v0, 0x7050301
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v2, v4, s2
+; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0x3010301
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
+; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_multi:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b64 v[0:1], v2, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v2, v2, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_perm_b32 v3, v1, v0, 0x6040200
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v2, 0x2000200
+; GFX11-DL-NEXT:    v_perm_b32 v0, v1, v0, 0x7050301
+; GFX11-DL-NEXT:    v_perm_b32 v2, v2, v2, 0x3010301
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v1, v3, v4, s2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_nop 0
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+  %v1e0 = extractelement <8 x i8> %vec1, i64 0
+  %cv1e0 = zext i8 %v1e0 to i32
+  %v2e0 = extractelement <8 x i8> %vec2, i64 0
+  %cv2e0 = zext i8 %v2e0 to i32
+  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+  %v1e1 = extractelement <8 x i8> %vec1, i64 1
+  %cv1e1 = zext i8 %v1e1 to i32
+  %v2e1 = extractelement <8 x i8> %vec2, i64 1
+  %cv2e1 = zext i8 %v2e1 to i32
+  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+  %v1e2 = extractelement <8 x i8> %vec1, i64 2
+  %cv1e2 = zext i8 %v1e2 to i32
+  %v2e2 = extractelement <8 x i8> %vec2, i64 2
+  %cv2e2 = zext i8 %v2e2 to i32
+  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+  %v1e3 = extractelement <8 x i8> %vec1, i64 3
+  %cv1e3 = zext i8 %v1e3 to i32
+  %v2e3 = extractelement <8 x i8> %vec2, i64 3
+  %cv2e3 = zext i8 %v2e3 to i32
+  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+  %v1e4 = extractelement <8 x i8> %vec1, i64 4
+  %cv1e4 = zext i8 %v1e4 to i32
+  %v2e4 = extractelement <8 x i8> %vec2, i64 4
+  %cv2e4 = zext i8 %v2e4 to i32
+  %mul5 = mul nuw nsw i32 %cv1e4, %cv2e0
+
+  %v1e5 = extractelement <8 x i8> %vec1, i64 5
+  %cv1e5 = zext i8 %v1e5 to i32
+  %v2e5 = extractelement <8 x i8> %vec2, i64 5
+  %cv2e5 = zext i8 %v2e5 to i32
+  %mul6 = mul nuw nsw i32 %cv1e5, %cv2e1
+
+  %v1e6 = extractelement <8 x i8> %vec1, i64 6
+  %cv1e6 = zext i8 %v1e6 to i32
+  %v2e6 = extractelement <8 x i8> %vec2, i64 6
+  %cv2e6 = zext i8 %v2e6 to i32
+  %mul7 = mul nuw nsw i32 %cv1e6, %cv2e2
+
+  %v1e7 = extractelement <8 x i8> %vec1, i64 7
+  %cv1e7 = zext i8 %v1e7 to i32
+  %v2e7 = extractelement <8 x i8> %vec2, i64 7
+  %cv2e7 = zext i8 %v2e7 to i32
+  %mul8 = mul nuw nsw i32 %cv1e7, %cv2e3
+
+  %acc = load i32, ptr addrspace(1) %dst, align 4
+  %mad11 = add i32 %mul1, %acc
+  %mad21 = add i32 %mad11, %mul3
+  %mad31 = add i32 %mad21, %mul5
+  %mad41 = add i32 %mad31, %mul7
+  %mad12 = add i32 %mul2, %mad41
+  %mad22 = add i32 %mad12, %mul4
+  %mad32 = add i32 %mad22, %mul6
+  %mad42 = add i32 %mad32, %mul8
+
+  store i32 %mad42, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
index 226670a550014c..d4ad53291070d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
 }
 
 ; GCN-LABEL: {{^}}mad_f16_imm_b:
-; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
+; GCN: v_madmk_f16 {{v[0-9]+}}, {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
 define amdgpu_kernel void @mad_f16_imm_b(
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index f90d338ffc4873..1926d0f8f0d6fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -256,8 +256,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mac_f32_e32 v1, 0x40400000, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT:    v_madmk_f32 v0, v0, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -280,8 +280,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    s_mov_b32 s0, s4
 ; VI-FLUSH-NEXT:    s_mov_b32 s1, s5
-; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, 0x4200, v0
-; VI-FLUSH-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; VI-FLUSH-NEXT:    v_madmk_f16 v0, v0, 0x4200, v1
+; VI-FLUSH-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-FLUSH-NEXT:    s_endpgm
 ;
 ; VI-DENORM-LABEL: fmuladd_f16_imm_a:
@@ -353,8 +353,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
 ; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
-; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
-; GFX10-DENORM-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; GFX10-DENORM-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v1
+; GFX10-DENORM-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a:
@@ -442,8 +442,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mac_f32_e32 v1, 0x40400000, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT:    v_madmk_f32 v0, v0, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -466,8 +466,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    s_mov_b32 s0, s4
 ; VI-FLUSH-NEXT:    s_mov_b32 s1, s5
-; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, 0x4200, v0
-; VI-FLUSH-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; VI-FLUSH-NEXT:    v_madmk_f16 v0, v0, 0x4200, v1
+; VI-FLUSH-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-FLUSH-NEXT:    s_endpgm
 ;
 ; VI-DENORM-LABEL: fmuladd_f16_imm_b:
@@ -539,8 +539,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
 ; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
-; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
-; GFX10-DENORM-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; GFX10-DENORM-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v1
+; GFX10-DENORM-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index f55242a8726beb..528232a203acfe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -203,7 +203,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2269,7 +2269,7 @@ define float @v_log_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -2472,7 +2472,7 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2675,7 +2675,7 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2878,7 +2878,7 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -3015,9 +3015,7 @@ define float @v_log_f32_fast(float %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3f317218, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_fast:
@@ -3135,9 +3133,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3f317218, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_unsafe_math_attr:
@@ -3255,9 +3251,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3f317218, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_approx_fn_attr:
@@ -3441,7 +3435,7 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -3577,9 +3571,7 @@ define float @v_log_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3f317218, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_afn:
@@ -3726,9 +3718,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3f317218, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_afn_dynamic:
@@ -3844,11 +3834,10 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc1b17218, s0
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e64 v0, |v0|, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3f317218, v0
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_fabs_log_f32_afn:
@@ -3982,7 +3971,7 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4174,7 +4163,7 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -4324,7 +4313,7 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4516,7 +4505,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -4666,7 +4655,7 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4858,7 +4847,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -5041,9 +5030,9 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -5152,9 +5141,9 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan_ninf_daz:
@@ -5322,9 +5311,9 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -5548,7 +5537,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -5722,7 +5711,7 @@ define float @v_log_f32_undef() {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -5887,7 +5876,7 @@ define float @v_log_f32_0() {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -6041,7 +6030,7 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -6212,7 +6201,7 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -6390,7 +6379,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index ab6325216c06d9..2e5bf2e5609512 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -203,7 +203,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2269,7 +2269,7 @@ define float @v_log10_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -2472,7 +2472,7 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2675,7 +2675,7 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -2878,7 +2878,7 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -3015,9 +3015,7 @@ define float @v_log10_f32_fast(float %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3e9a209b, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_fast:
@@ -3135,9 +3133,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3e9a209b, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_unsafe_math_attr:
@@ -3255,9 +3251,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3e9a209b, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_approx_fn_attr:
@@ -3441,7 +3435,7 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -3577,9 +3571,7 @@ define float @v_log10_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3e9a209b, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_afn:
@@ -3726,9 +3718,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3e9a209b, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_afn_dynamic:
@@ -3844,11 +3834,10 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc11a209b, s0
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e64 v0, |v0|, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v1, 0x3e9a209b, v0
-; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_fabs_log10_f32_afn:
@@ -3982,7 +3971,7 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4174,7 +4163,7 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -4324,7 +4313,7 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4516,7 +4505,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -4666,7 +4655,7 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4858,7 +4847,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -5041,9 +5030,9 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -5152,9 +5141,9 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan_ninf_daz:
@@ -5322,9 +5311,9 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -5548,7 +5537,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
@@ -5722,7 +5711,7 @@ define float @v_log10_f32_undef() {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -5887,7 +5876,7 @@ define float @v_log10_f32_0() {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -6041,7 +6030,7 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -6212,7 +6201,7 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -6390,7 +6379,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
index 344ee62b440650..b97acaa8d7b35b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs --version 3
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-lower-ctor-dtor %s | FileCheck %s
 
 ; Make sure we emit code for constructor entries that aren't direct
@@ -16,29 +17,38 @@
 
 @foo.alias = hidden alias void (), ptr @foo
 
-;.
-; CHECK: @__init_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @llvm.used = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
-; CHECK: @foo.alias = hidden alias void (), ptr @foo
-;.
 define void @foo() {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:    ret void
-;
   ret void
 }
 
 define void @bar() addrspace(1) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:    ret void
-;
   ret void
 }
 
-; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init()
+
+
+;.
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null }, { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }]
+; CHECK: @[[LLVM_GLOBAL_DTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }]
+; CHECK: @[[__INIT_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
+; CHECK: @[[FOO_ALIAS:[a-zA-Z0-9_$"\\.-]+]] = hidden alias void (), ptr @foo
+;.
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define void @bar(
+; CHECK-SAME: ) addrspace(1) #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__init_array_start, ptr addrspace(1) @__init_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
 ; CHECK:       while.entry:
@@ -50,8 +60,10 @@ define void @bar() addrspace(1) {
 ; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
-
-; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini()
+;
+;
+; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
 ; CHECK:       while.entry:
@@ -63,6 +75,9 @@ define void @bar() addrspace(1) {
 ; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
-
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-flat-work-group-size"="1,1" "device-init" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { "amdgpu-flat-work-group-size"="1,1" "device-fini" }
+;
+;.
+; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx900" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,1" "device-init" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,1" "device-fini" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
index 5c8e56dd93933d..aca5886bce5f78 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs --version 3
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-ctor-dtor < %s | FileCheck %s
 
@@ -11,39 +12,9 @@
 @llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
 @llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
 
-; CHECK: @__init_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @llvm.used = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini]
 
-; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init() #0
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__init_array_start, ptr addrspace(1) @__init_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.entry:
-; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__init_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
-; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
-; CHECK-NEXT:    call void [[CALLBACK]]()
-; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__init_array_end
-; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    ret void
 
-; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini() #1
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.entry:
-; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
-; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
-; CHECK-NEXT:    call void [[CALLBACK]]()
-; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
-; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    ret void
 
-; CHECK-NOT: amdgcn.device.
 
 ; VISIBILITY: FUNC   WEAK PROTECTED {{.*}} amdgcn.device.init
 ; VISIBILITY: OBJECT WEAK DEFAULT {{.*}} amdgcn.device.init.kd
@@ -73,5 +44,53 @@ define internal void @bar() {
   ret void
 }
 
-; CHECK: attributes #0 = { "amdgpu-flat-work-group-size"="1,1" "device-init" }
-; CHECK: attributes #1 = { "amdgpu-flat-work-group-size"="1,1" "device-fini" }
+;.
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
+; CHECK: @[[LLVM_GLOBAL_DTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
+; CHECK: @[[__INIT_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
+;.
+; CHECK-LABEL: define internal void @foo() {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void @bar() {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__init_array_start, ptr addrspace(1) @__init_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.entry:
+; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__init_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT:    call void [[CALLBACK]]()
+; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__init_array_end
+; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.entry:
+; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT:    call void [[CALLBACK]]()
+; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
+; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,1" "device-init" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,1" "device-fini" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 7a8a183ffbc92d..48f53309385812 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
@@ -31,19 +32,20 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
-; CHECK-LABEL: @test(
+; CHECK-LABEL: define protected amdgpu_kernel void @test(
+; CHECK-SAME: ptr addrspace(1) nocapture [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope !1, !noalias !4
-; CHECK-NEXT:    tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds (%llvm.amdgcn.kernel.test.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.test.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1
-; CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i8 [[TMP4]], 3
+; CHECK-NEXT:    tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1
+; CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 3
 ; CHECK-NEXT:    store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope !1, !noalias !4
-; CHECK-NEXT: tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds (%llvm.amdgcn.kernel.test.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.test.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1
-; CHECK-NEXT:    [[CMP_I_I19:%.*]] = icmp eq i8 [[TMP9]], 2
-; CHECK-NEXT:    [[TMP10:%.*]] = and i1 [[CMP_I_I19]], [[CMP_I_I]]
-; CHECK-NEXT:    [[FROMBOOL8:%.*]] = zext i1 [[TMP10]] to i8
-; CHECK-NEXT:    store i8 [[FROMBOOL8]], ptr addrspace(1) [[PTR_COERCE:%.*]], align 1
+; CHECK-NEXT:    tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1
+; CHECK-NEXT:    [[CMP_I_I19:%.*]] = icmp eq i8 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = and i1 [[CMP_I_I19]], [[CMP_I_I]]
+; CHECK-NEXT:    [[FROMBOOL8:%.*]] = zext i1 [[TMP2]] to i8
+; CHECK-NEXT:    store i8 [[FROMBOOL8]], ptr addrspace(1) [[PTR_COERCE]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -64,7 +66,8 @@ entry:
 declare void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #1
 
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="7" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{i64 0, i64 1}
 ; CHECK: [[META1:![0-9]+]] = !{!2}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
index 10064664aa991e..6e13cfb00a4b35 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -1,16 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
 
 @a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 
-; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
-; CHECK: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa !1, !noalias !6
-; CHECK: %val.a = load i32, ptr addrspace(3) %gep.a, align 4, !tbaa !1, !noalias !6
-; CHECK: store i32 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa !1, !noalias !6
-; CHECK: %val.b = load i32, ptr addrspace(3) %gep.b, align 4, !tbaa !1, !noalias !6
-
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(ptr addrspace(1) %arg, i32 %i) {
+; CHECK-LABEL: define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(
+; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa [[TBAA1:![0-9]+]], !noalias !6
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1]], !noalias !6
+; CHECK-NEXT:    store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA1]], !noalias !6
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !noalias !6
+; CHECK-NEXT:    [[VAL:%.*]] = add i32 [[VAL_A]], [[VAL_B]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
   store i32 1, ptr addrspace(3) @a, align 4, !alias.scope !0, !noalias !3, !tbaa !5
   %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
index d7697c903721b7..fe88b7770e09c7 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -10,19 +10,38 @@
 ; does not happen because when SILoadStoreOptimizer is run, the reads and writes
 ; are not adjacent. They are only moved later by MachineScheduler.
 
-; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
-; GCN: ds_write_b32
-; GCN: ds_write_b32
-; GCN: ds_read_b32
-; GCN: ds_read_b32
-
-; CHECK-LABEL: @no_clobber_ds_load_stores_x2
-; CHECK: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, align 16, !alias.scope !1, !noalias !4
-; CHECK: %val.a = load i32, ptr addrspace(3) %gep.a, align 4, !alias.scope !1, !noalias !4
-; CHECK: store i32 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 1), align 16, !alias.scope !4, !noalias !1
-; CHECK: %val.b = load i32, ptr addrspace(3) %gep.b, align 4, !alias.scope !4, !noalias !1
-
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i32 %i) {
+; CHECK-LABEL: define amdgpu_kernel void @no_clobber_ds_load_stores_x2(
+; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, align 16, !alias.scope !1, !noalias !4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !alias.scope !1, !noalias !4
+; CHECK-NEXT:    store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 1), align 16, !alias.scope !4, !noalias !1
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 1), i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !alias.scope !4, !noalias !1
+; CHECK-NEXT:    [[VAL:%.*]] = add i32 [[VAL_A]], [[VAL_B]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT:    ret void
+;
+; GCN-LABEL: no_clobber_ds_load_stores_x2:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 2
+; GCN-NEXT:    ds_write_b32 v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    ds_write_b32 v1, v2 offset:256
+; GCN-NEXT:    ds_read_b32 v2, v0
+; GCN-NEXT:    ds_read_b32 v0, v0 offset:256
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
+; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    s_endpgm
 bb:
   store i32 1, ptr addrspace(3) @a, align 4
   %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i
@@ -35,24 +54,46 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_read_b32
-; GCN-DAG: ds_read_b32
-; GCN-DAG: ds_read_b32
-
-; CHECK-LABEL: @no_clobber_ds_load_stores_x3
-; CHECK: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, align 16, !alias.scope !6, !noalias !9
-; CHECK: %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 %i
-; CHECK: %val.a = load i32, ptr addrspace(3) %gep.a, align 4, !alias.scope !6, !noalias !9
-; CHECK: store i32 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 1), align 16, !alias.scope !12, !noalias !13
-; CHECK: %val.b = load i32, ptr addrspace(3) %gep.b, align 4, !alias.scope !12, !noalias !13
-; CHECK: store i32 3, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 2), align 16, !alias.scope !14, !noalias !15
-; CHECK: %val.c = load i32, ptr addrspace(3) %gep.c, align 4, !alias.scope !14, !noalias !15
-
 define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i32 %i) {
+; CHECK-LABEL: define amdgpu_kernel void @no_clobber_ds_load_stores_x3(
+; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, align 16, !alias.scope !6, !noalias !9
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !alias.scope !6, !noalias !9
+; CHECK-NEXT:    store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 1), align 16, !alias.scope !12, !noalias !13
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 1), i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !alias.scope !12, !noalias !13
+; CHECK-NEXT:    store i32 3, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 2), align 16, !alias.scope !14, !noalias !15
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 2), i32 0, i32 [[I]]
+; CHECK-NEXT:    [[VAL_C:%.*]] = load i32, ptr addrspace(3) [[GEP_C]], align 4, !alias.scope !14, !noalias !15
+; CHECK-NEXT:    [[VAL_1:%.*]] = add i32 [[VAL_A]], [[VAL_B]]
+; CHECK-NEXT:    [[VAL:%.*]] = add i32 [[VAL_1]], [[VAL_C]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT:    ret void
+;
+; GCN-LABEL: no_clobber_ds_load_stores_x3:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 2
+; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    ds_write_b32 v1, v2 offset:256
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    v_mov_b32_e32 v2, 3
+; GCN-NEXT:    ds_write_b32 v1, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    ds_write_b32 v1, v2 offset:512
+; GCN-NEXT:    ds_read_b32 v2, v0
+; GCN-NEXT:    ds_read_b32 v3, v0 offset:256
+; GCN-NEXT:    ds_read_b32 v0, v0 offset:512
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
+; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
+; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    s_endpgm
 bb:
   store i32 1, ptr addrspace(3) @a, align 4
   %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
index a5cc452a9c27ea..c999332cb65423 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
 ; Not reached by a non-kernel function and therefore not changed by this pass
@@ -33,7 +33,7 @@
 
 
 define amdgpu_kernel void @kernel_only() {
-; CHECK-LABEL: @kernel_only() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_only() {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0
 ; CHECK-NEXT:    store double 3.140000e+00, ptr addrspace(3) [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    ret void
@@ -45,7 +45,7 @@ define amdgpu_kernel void @kernel_only() {
 
 ; The accesses from functions are rewritten to go through the llvm.amdgcn.dynlds.offset.table
 define void @use_shared1() {
-; CHECK-LABEL: @use_shared1() {
+; CHECK-LABEL: define void @use_shared1() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[DYNAMIC_SHARED1:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED1]], align 4
@@ -60,7 +60,8 @@ define void @use_shared1() {
 }
 
 define void @use_shared2() #0 {
-; CHECK-LABEL: @use_shared2() #0 {
+; CHECK-LABEL: define void @use_shared2(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[DYNAMIC_SHARED2:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED2]], align 4
@@ -77,7 +78,8 @@ define void @use_shared2() #0 {
 ; Include a normal variable so that the new variables aren't all at the same absolute_symbol
 @static_shared = addrspace(3) global i32 poison
 define void @use_shared4() #0 {
-; CHECK-LABEL: @use_shared4() #0 {
+; CHECK-LABEL: define void @use_shared4(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    store i32 4, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
 ; CHECK-NEXT:    [[DYNAMIC_SHARED4:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
@@ -94,7 +96,8 @@ define void @use_shared4() #0 {
 }
 
 define void @use_shared8() #0 {
-; CHECK-LABEL: @use_shared8() #0 {
+; CHECK-LABEL: define void @use_shared8(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[DYNAMIC_SHARED8:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4
@@ -110,7 +113,7 @@ define void @use_shared8() #0 {
 
 ; The kernels are annotated with kernel.id and llvm.donothing use of the corresponding variable
 define amdgpu_kernel void @expect_align1() {
-; CHECK-LABEL: @expect_align1() !llvm.amdgcn.lds.kernel.id !2
+; CHECK-LABEL: define amdgpu_kernel void @expect_align1() !llvm.amdgcn.lds.kernel.id !2 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds) ]
 ; CHECK-NEXT:    call void @use_shared1()
 ; CHECK-NEXT:    ret void
@@ -120,7 +123,7 @@ define amdgpu_kernel void @expect_align1() {
 }
 
 define amdgpu_kernel void @expect_align2() {
-; CHECK-LABEL: @expect_align2() !llvm.amdgcn.lds.kernel.id !3
+; CHECK-LABEL: define amdgpu_kernel void @expect_align2() !llvm.amdgcn.lds.kernel.id !3 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds) ]
 ; CHECK-NEXT:    call void @use_shared2()
 ; CHECK-NEXT:    ret void
@@ -130,7 +133,8 @@ define amdgpu_kernel void @expect_align2() {
 }
 
 define amdgpu_kernel void @expect_align4() {
-; CHECK-LABEL: @expect_align4() #1 !llvm.amdgcn.lds.kernel.id !4 {
+; CHECK-LABEL: define amdgpu_kernel void @expect_align4(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id !4 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ]
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT:    call void @use_shared4()
@@ -142,7 +146,7 @@ define amdgpu_kernel void @expect_align4() {
 
 ; Use dynamic_shared directly too.
 define amdgpu_kernel void @expect_align8() {
-; CHECK-LABEL: @expect_align8() !llvm.amdgcn.lds.kernel.id !5 {
+; CHECK-LABEL: define amdgpu_kernel void @expect_align8() !llvm.amdgcn.lds.kernel.id !5 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9
 ; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4
@@ -157,7 +161,8 @@ define amdgpu_kernel void @expect_align8() {
 
 ; Note: use_shared4 uses module.lds so this will allocate at offset 4
 define amdgpu_kernel void @expect_max_of_2_and_4() {
-; CHECK-LABEL: @expect_max_of_2_and_4() #1 !llvm.amdgcn.lds.kernel.id !6 {
+; CHECK-LABEL: define amdgpu_kernel void @expect_max_of_2_and_4(
+; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id !6 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ]
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT:    call void @use_shared2()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
index 7b534ab76f0e4d..cb34d48875d103 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
 
@@ -32,13 +33,22 @@
 
 ; Functions that are not called are ignored by the lowering
 define amdgpu_kernel void @call_func() {
+; CHECK-LABEL: define amdgpu_kernel void @call_func(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT:    call void @func()
+; CHECK-NEXT:    ret void
+;
   call void @func()
   ret void
 }
 
-; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 1.000000e+00 monotonic, align 8
 define void @func() {
+; CHECK-LABEL: define void @func() {
+; CHECK-NEXT:    [[DEC:%.*]] = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 1.000000e+00 monotonic, align 8
+; CHECK-NEXT:    [[UNUSED0:%.*]] = atomicrmw add ptr addrspace(1) @ignored, i64 1 monotonic, align 8
+; CHECK-NEXT:    ret void
+;
   %dec = atomicrmw fsub ptr addrspace(3) @tolower, float 1.0 monotonic
   %unused0 = atomicrmw add ptr addrspace(1) @ignored, i64 1 monotonic
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 61bb25f3e42ace..82004b84275e6f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=OPT %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=GCN %s
 
@@ -30,7 +31,7 @@
 
 
 define void @f0() {
-; OPT-LABEL: @f0(
+; OPT-LABEL: define void @f0() {
 ; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; OPT-NEXT:    [[V02:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
 ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V02]], align 4
@@ -71,7 +72,7 @@ define void @f0() {
 }
 
 define void @f1() {
-; OPT-LABEL: @f1(
+; OPT-LABEL: define void @f1() {
 ; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; OPT-NEXT:    [[V12:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
 ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V12]], align 4
@@ -112,7 +113,7 @@ define void @f1() {
 }
 
 define void @f2() {
-; OPT-LABEL: @f2(
+; OPT-LABEL: define void @f2() {
 ; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; OPT-NEXT:    [[V22:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
 ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
@@ -153,7 +154,7 @@ define void @f2() {
 }
 
 define void @f3() {
-; OPT-LABEL: @f3(
+; OPT-LABEL: define void @f3() {
 ; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; OPT-NEXT:    [[V32:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
 ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V32]], align 4
@@ -195,7 +196,8 @@ define void @f3() {
 
 ; Doesn't access any via a function, won't be in the lookup table
 define amdgpu_kernel void @kernel_no_table() {
-; OPT-LABEL: @kernel_no_table() #0 {
+; OPT-LABEL: define amdgpu_kernel void @kernel_no_table(
+; OPT-SAME: ) #[[ATTR0:[0-9]+]] {
 ; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
 ; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 8
 ; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
@@ -218,8 +220,9 @@ define amdgpu_kernel void @kernel_no_table() {
 
 ; Access two variables, will allocate those two
 define amdgpu_kernel void @k01() {
-; OPT-LABEL: @k01() #0 !llvm.amdgcn.lds.kernel.id !1 {
-; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ]
+; OPT-LABEL: define amdgpu_kernel void @k01(
+; OPT-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id !1 {
+; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ], !alias.scope !2, !noalias !5
 ; OPT-NEXT:    call void @f0()
 ; OPT-NEXT:    call void @f1()
 ; OPT-NEXT:    ret void
@@ -256,8 +259,9 @@ define amdgpu_kernel void @k01() {
 }
 
 define amdgpu_kernel void @k23() {
-; OPT-LABEL: @k23() #1 !llvm.amdgcn.lds.kernel.id !7 {
-; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ]
+; OPT-LABEL: define amdgpu_kernel void @k23(
+; OPT-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id !7 {
+; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope !8, !noalias !11
 ; OPT-NEXT:    call void @f2()
 ; OPT-NEXT:    call void @f3()
 ; OPT-NEXT:    ret void
@@ -295,8 +299,9 @@ define amdgpu_kernel void @k23() {
 
 ; Access and allocate three variables
 define amdgpu_kernel void @k123() {
-; OPT-LABEL: @k123() #2 !llvm.amdgcn.lds.kernel.id !13 {
-; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ]
+; OPT-LABEL: define amdgpu_kernel void @k123(
+; OPT-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !13 {
+; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !14, !noalias !17
 ; OPT-NEXT:    call void @f1()
 ; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21
 ; OPT-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
index 57362b71cf6656..7aa1d212ed417a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
@@ -1,6 +1,9 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3
 ; RUN: opt -S -mtriple=amdgcn--  -amdgpu-lower-ctor-dtor < %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf -s - 2>&1 | FileCheck %s -check-prefix=CHECK-VIS
 
+
+; UTC_ARGS: --disable
 @llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }, { i32, ptr, ptr } { i32 1, ptr @foo.5, ptr null }]
 @llvm.global_dtors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }, { i32, ptr, ptr } { i32 1, ptr @bar.5, ptr null }]
 
@@ -9,32 +12,8 @@
 ; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @llvm.used = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini]
+; UTC_ARGS: --enable
 
-; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init() #0
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__init_array_start, ptr addrspace(1) @__init_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.entry:
-; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__init_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
-; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
-; CHECK-NEXT:    call void [[CALLBACK]]()
-; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__init_array_end
-; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    ret void
-
-; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini() #1
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
-; CHECK:       while.entry:
-; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
-; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
-; CHECK-NEXT:    call void [[CALLBACK]]()
-; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
-; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
-; CHECK:       while.end:
-; CHECK-NEXT:    ret void
 
 ; CHECK-VIS: FUNC   WEAK PROTECTED {{.*}} amdgcn.device.init
 ; CHECK-VIS: OBJECT WEAK DEFAULT {{.*}} amdgcn.device.init.kd
@@ -57,5 +36,48 @@ define internal void @bar.5() {
   ret void
 }
 
-; CHECK: attributes #0 = { "amdgpu-flat-work-group-size"="1,1" "device-init" }
-; CHECK: attributes #1 = { "amdgpu-flat-work-group-size"="1,1" "device-fini" }
+; CHECK-LABEL: define internal void @foo() {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void @bar() {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void @foo.5() {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void @bar.5() {
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.init(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__init_array_start, ptr addrspace(1) @__init_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.entry:
+; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__init_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT:    call void [[CALLBACK]]()
+; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__init_array_end
+; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define weak_odr amdgpu_kernel void @amdgcn.device.fini(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 icmp ne (ptr addrspace(1) @__fini_array_start, ptr addrspace(1) @__fini_array_end), label [[WHILE_ENTRY:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.entry:
+; CHECK-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ @__fini_array_start, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[WHILE_ENTRY]] ]
+; CHECK-NEXT:    [[CALLBACK:%.*]] = load ptr, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT:    call void [[CALLBACK]]()
+; CHECK-NEXT:    [[NEXT]] = getelementptr ptr addrspace(1), ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT:    [[END:%.*]] = icmp eq ptr addrspace(1) [[NEXT]], @__fini_array_end
+; CHECK-NEXT:    br i1 [[END]], label [[WHILE_END]], label [[WHILE_ENTRY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll
index 51a0a50fbbff5c..00e226291e68b6 100644
--- a/llvm/test/CodeGen/AMDGPU/madmk.ll
+++ b/llvm/test/CodeGen/AMDGPU/madmk.ll
@@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; GCN-LABEL: {{^}}madmk_f32:
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
+; GCN: v_madmk_f32 {{v[0-9]+}}, [[VA]], 0x41200000, [[VB]]
 define amdgpu_kernel void @madmk_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -96,7 +96,7 @@ define amdgpu_kernel void @s_s_madmk_f32(ptr addrspace(1) noalias %out, [8 x i32
 ; GCN-DAG: s_load_dword [[SREG:s[0-9]+]]
 ; GCN-DAG: buffer_load_dword [[VREG1:v[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG]]
-; GCN: v_mac_f32_e32 [[VREG2]], 0x41200000, [[VREG1]]
+; GCN: v_madmk_f32 {{v[0-9]+}}, [[VREG1]], 0x41200000, [[VREG2]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_s_madmk_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, float %b) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -171,8 +171,9 @@ define amdgpu_kernel void @no_madmk_src2_modifier_f32(ptr addrspace(1) noalias %
 
 ; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
 ; GCN: buffer_load_dword [[A:v[0-9]+]]
-; GCN: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
-; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[SK]], 2.0
+; GCN: v_mov_b32_e32 [[B:v[0-9]+]], 2.0
+; GCN: v_madmk_f32 {{v[0-9]+}}, [[A]], 0x41200000, [[B]]
+
 define amdgpu_kernel void @madmk_add_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
index a399b509014dd7..b54cc27db1d12c 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -112,8 +112,8 @@ entry:
 ; A subregister use operand should not be tied.
 ; CHECK-LABEL: {{^}}no_fold_tied_subregister:
 ; CHECK: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
-; CHECK: buffer_store_dword v[[LO]]
+; CHECK: v_madmk_f32 v[[RES:[0-9]+]], v[[HI]], 0x41200000, v[[LO]]
+; CHECK: buffer_store_dword v[[RES]]
 define amdgpu_kernel void @no_fold_tied_subregister() #1 {
   %tmp1 = load volatile <2 x float>, ptr addrspace(1) undef
   %tmp2 = extractelement <2 x float> %tmp1, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 705a2af7395906..4f2fd3f50494c9 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -19,14 +19,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_sub_u32 s4, 0, s10
 ; GCN-NEXT:    s_subb_u32 s5, 0, s11
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s12, s3, 31
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    s_add_u32 s2, s2, s12
@@ -247,12 +247,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v6, v2
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
 ; GCN-NEXT:    v_subb_u32_e32 v8, vcc, 0, v2, vcc
-; GCN-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GCN-NEXT:    v_madmk_f32 v5, v6, 0x4f800000, v5
 ; GCN-NEXT:    v_rcp_f32_e32 v5, v5
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GCN-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v6, v6
-; GCN-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GCN-NEXT:    v_madmk_f32 v5, v6, 0xcf800000, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GCN-NEXT:    v_mul_hi_u32 v9, v7, v5
@@ -1093,12 +1093,12 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GCN-NEXT:    s_sub_u32 s4, 0, s2
 ; GCN-NEXT:    s_subb_u32 s5, 0, s3
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
@@ -1287,12 +1287,12 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; GCN-NEXT:    v_madmk_f32 v3, v4, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; GCN-NEXT:    v_madmk_f32 v3, v4, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v5, v3
@@ -1484,12 +1484,12 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; GCN-NEXT:    v_madmk_f32 v3, v4, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; GCN-NEXT:    v_madmk_f32 v3, v4, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v5, v3
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index fdecb65981c537..8da720d7f991cd 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1,46 +1,86 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
+; RUN: llc -global-isel=0 -march=amdgcn < %s | FileCheck --check-prefixes=SI,SI-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn < %s | FileCheck --check-prefixes=SI,SI-GISEL %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,VI-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,VI-GISEL %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
 
 ; Test that add/sub with a constant is swapped to sub/add with negated
 ; constant to minimize code size.
 
 define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_x_sub_64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_x_sub_64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_x_sub_64:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_x_sub_64:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_x_sub_64:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_x_sub_64:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64:
 ; GFX9:       ; %bb.0:
@@ -87,50 +127,97 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_x_sub_64_multi_use:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_x_sub_64_multi_use:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1] glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_load_dword v4, v[0:1] glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
-; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 64, v4
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1] glc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    flat_load_dword v4, v[0:1] glc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; VI-SDAG-NEXT:    v_subrev_u32_e32 v3, vcc, 64, v4
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v3
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1] glc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    flat_load_dword v4, v[0:1] glc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
+; VI-GISEL-NEXT:    v_subrev_u32_e32 v3, vcc, 64, v4
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v3
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
 ; GFX9:       ; %bb.0:
@@ -198,38 +285,73 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_64_sub_x:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_64_sub_x:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 64, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_64_sub_x:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_64_sub_x:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_64_sub_x:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, 64, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_64_sub_x:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, 64, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_64_sub_x:
 ; GFX9:       ; %bb.0:
@@ -276,73 +398,143 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_x_sub_65:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_x_sub_65:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffbf, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_i32_x_sub_65:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_i32_x_sub_65:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_i32_x_sub_65:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_x_sub_65:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_x_sub_65:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0x41, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_x_sub_65:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffbf, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_x_sub_65:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 0x41, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0x41, v1
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i32_x_sub_65:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -354,38 +546,73 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_65_sub_x:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_65_sub_x:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0x41, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_65_sub_x:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_65_sub_x:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_65_sub_x:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, 0x41, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_65_sub_x:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, 0x41, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_65_sub_x:
 ; GFX9:       ; %bb.0:
@@ -432,73 +659,143 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_x_sub_neg16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_x_sub_neg16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_i32_x_sub_neg16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_i32_x_sub_neg16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 16, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_i32_x_sub_neg16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 16, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_x_sub_neg16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_x_sub_neg16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, -16, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_x_sub_neg16:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 16, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_x_sub_neg16:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, -16, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -510,38 +807,73 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 }
 
 define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_neg16_sub_x:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_neg16_sub_x:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, -16, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_neg16_sub_x:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_neg16_sub_x:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_neg16_sub_x:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, -16, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_neg16_sub_x:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, -16, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
 ; GFX9:       ; %bb.0:
@@ -588,73 +920,143 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
 }
 
 define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_x_sub_neg17:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_x_sub_neg17:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 17, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_i32_x_sub_neg17:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v1, 17, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_i32_x_sub_neg17:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 17, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_i32_x_sub_neg17:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 17, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_x_sub_neg17:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_x_sub_neg17:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0xffffffef, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_x_sub_neg17:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 17, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_x_sub_neg17:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u32_e32 v2, vcc, 0xffffffef, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 17, v1
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0xffffffef, v1
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
@@ -666,38 +1068,73 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 }
 
 define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i32_neg17_sub_x:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i32_neg17_sub_x:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0xffffffef, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i32_neg17_sub_x:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i32_neg17_sub_x:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i32_neg17_sub_x:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u32_e32 v2, vcc, 0xffffffef, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i32_neg17_sub_x:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u32_e32 v2, vcc, 0xffffffef, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
 ; GFX9:       ; %bb.0:
@@ -799,38 +1236,73 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
 }
 
 define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i16_x_sub_64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i16_x_sub_64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i16_x_sub_64:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i16_x_sub_64:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i16_x_sub_64:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_ushort v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-SDAG-NEXT:    flat_store_short v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i16_x_sub_64:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_ushort v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-GISEL-NEXT:    flat_store_short v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64:
 ; GFX9:       ; %bb.0:
@@ -877,41 +1349,79 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
-; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT:    flat_load_ushort v2, v[1:2]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_subrev_u16_e32 v2, 64, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
+; SI-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-SDAG-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-SDAG-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-SDAG-NEXT:    flat_load_ushort v2, v[1:2]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 1, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; VI-GISEL-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-GISEL-NEXT:    flat_load_ushort v2, v[1:2]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v2
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX9:       ; %bb.0:
@@ -965,50 +1475,97 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 }
 
 define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_i16_x_sub_64_multi_use:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
-; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_i16_x_sub_64_multi_use:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ushort v3, v[0:1] glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_load_ushort v4, v[0:1] glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-NEXT:    v_subrev_u16_e32 v3, 64, v4
-; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_short v[0:1], v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_ushort v3, v[0:1] glc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 64, v4
+; VI-SDAG-NEXT:    flat_store_short v[0:1], v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    flat_store_short v[0:1], v3
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_ushort v3, v[0:1] glc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v3, 64, v4
+; VI-GISEL-NEXT:    flat_store_short v[0:1], v2
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    flat_store_short v[0:1], v3
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX9:       ; %bb.0:
@@ -1076,44 +1633,88 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_64_64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_64_64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 64
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_subrev_u16_e32 v3, 64, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 64
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 64, v3
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 64
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-GISEL-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
 ; GFX9:       ; %bb.0:
@@ -1160,56 +1761,112 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_7_64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_7_64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 64
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, -7, v3
-; VI-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0x400007
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 7, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 64
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_e32 v2, -7, v3
+; VI-SDAG-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 64
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 7, v3
+; VI-GISEL-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x400007
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x400007
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX10:       ; %bb.0:
@@ -1245,56 +1902,112 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_64_123:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xff850000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_64_123:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0xffffff85
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_subrev_u16_e32 v3, 64, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0x7b0040
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xff850000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7b, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff85
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 64, v3
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7b
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v2, 64, v3
+; VI-GISEL-NEXT:    v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x7b0040
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b0040
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX10:       ; %bb.0:
@@ -1331,42 +2044,84 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 
 ; Can fold 0 and inline immediate in other half.
 define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_7_0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff
-; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_7_0:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_add_u16_e32 v3, -7, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 7, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; VI-SDAG-NEXT:    v_add_u16_e32 v3, -7, v3
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-GISEL-NEXT:    v_subrev_u16_e32 v3, 7, v3
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
 ; GFX9:       ; %bb.0:
@@ -1414,40 +2169,82 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 
 ; Can fold 0 and inline immediate in other half.
 define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_0_16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_0_16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v2, -16
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 16, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, -16
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 16
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
 ; GFX9:       ; %bb.0:
@@ -1494,52 +2291,106 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x3c000000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_0_1_0:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_brev_b32 s2, 35
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0x3c000000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0xffffc400, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffc400
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_brev_b32 s2, 35
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 35
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX10:       ; %bb.0:
@@ -1575,52 +2426,106 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xbc000000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_brev_b32 s2, 34
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xbc000000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x4400, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_brev_b32 s2, 34
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 34
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX10:       ; %bb.0:
@@ -1657,79 +2562,160 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 
 ; -32 isn't an inline immediate, but 32 is
 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg32_neg32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 32
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_subrev_u16_e32 v3, 32, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_movk_i32 s2, 0xffe0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 32
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 32, v3
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffffe0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffe0, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffe0ffe0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1741,75 +2727,153 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_0_neg32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_0_neg32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 32
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 32
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffffe0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffe00000
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1821,77 +2885,155 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg32_0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff
-; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg32_0:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_subrev_u16_e32 v3, 32, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v3, 32, v3
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-GISEL-NEXT:    v_add_u16_e32 v3, 0xffe0, v3
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffe0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1904,79 +3046,158 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
 
 ; 16 and -16 are both inline immediates
 define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg16_neg16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, -16
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, -16, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, -16
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_e32 v2, -16, v3
+; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, -16
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, -16, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1988,75 +3209,152 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_0_neg16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_0_neg16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v2, -16
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, -16
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, -16
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2068,77 +3366,154 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg16_0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff
-; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg16_0:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_add_u16_e32 v3, -16, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; VI-SDAG-NEXT:    v_add_u16_e32 v3, -16, v3
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-GISEL-NEXT:    v_add_u16_e32 v3, -16, v3
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 16
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v1, -16
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2150,80 +3525,161 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xc4000000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg_fpone:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0xffffc400
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0xc400, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0x3c003c00
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xc4000000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_movk_i32 s2, 0xc400
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc400
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0xc400, v3
+; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffc400
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xc400, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x3c003c00
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc400c400
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400, v1 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400, v1 op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2235,80 +3691,161 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44000000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0x4400, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0xbc00bc00
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0x44000000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_movk_i32 s2, 0x4400
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0x4400, v3
+; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0x4400, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0xbc00bc00
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x44004400
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2320,80 +3857,161 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 2.0, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg_fptwo:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x4000
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0x4000, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0xc000c000
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 2.0, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_movk_i32 s2, 0x4000
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0x4000, v3
+; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4000
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0x4000, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0xc000c000
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x40004000
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x4000, v1 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0x4000, v1 op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2405,80 +4023,161 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v2
-; SI-NEXT:    s_mov_b32 s4, 0xffff0000
-; SI-NEXT:    v_bfi_b32 v2, s4, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, -2.0, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0xffffc000
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0xc000, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0x40004000
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v2
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
+; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, -2.0, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_movk_i32 s2, 0xc000
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc000
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_add_u16_e32 v2, 0xc000, v3
+; VI-SDAG-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffc000
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xc000, v3
+; VI-GISEL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x40004000
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc000c000
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc000, v1 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xc000, v1 op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2490,75 +4189,152 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_undef_neg32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 32
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 32
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffffe0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, s0, v2
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, 0xffe00000
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, s2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 0xffe0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v1, 0xffe0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2570,74 +4346,149 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
-; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 32, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_test_v2i16_x_add_neg32_undef:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_subrev_u16_e32 v2, 32, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: v_test_v2i16_x_add_neg32_undef:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_subrev_i32_e32 v2, vcc, 32, v2
+; SI-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
+;
+; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    flat_load_dword v3, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT:    v_subrev_u16_e32 v2, 32, v3
+; VI-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; VI-SDAG-NEXT:    s_endpgm
+;
+; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
+; VI-GISEL-NEXT:    s_and_b32 s2, 0xffff, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-GISEL-NEXT:    s_lshl_b32 s0, s2, 16
+; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; VI-GISEL-NEXT:    v_add_u16_e32 v2, 0xffe0, v3
+; VI-GISEL-NEXT:    v_or_b32_e32 v2, s0, v2
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_endpgm
+;
+; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffffe0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1]
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 9955aca981518d..08db1e7fee259d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10093,7 +10093,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240
 ; GFX6-NEXT:    s_addc_u32 s41, s41, 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83800
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10103,7 +10103,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
+; GFX6-NEXT:    s_mov_b32 s2, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10112,7 +10112,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_mov_b32 s2, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10121,7 +10121,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
+; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10130,7 +10130,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10139,7 +10139,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT:    s_mov_b32 s2, 0x82800
+; GFX6-NEXT:    s_mov_b32 s2, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10148,7 +10148,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT:    s_mov_b32 s2, 0x82400
+; GFX6-NEXT:    s_mov_b32 s2, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10157,7 +10157,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT:    s_mov_b32 s2, 0x82000
+; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10166,7 +10166,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10175,7 +10175,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT:    s_mov_b32 s2, 0x81800
+; GFX6-NEXT:    s_mov_b32 s2, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10184,7 +10184,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x81400
+; GFX6-NEXT:    s_mov_b32 s2, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10218,7 +10218,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT:    s_mov_b32 s2, 0x81000
+; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10245,7 +10245,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s12, 0x84000
+; GFX6-NEXT:    s_mov_b32 s12, 0x83c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10273,183 +10273,197 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s8, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s9, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s10, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s11, 3
-; GFX6-NEXT:    v_writelane_b32 v0, s12, 4
-; GFX6-NEXT:    v_writelane_b32 v0, s13, 5
-; GFX6-NEXT:    v_writelane_b32 v0, s14, 6
-; GFX6-NEXT:    v_writelane_b32 v0, s15, 7
-; GFX6-NEXT:    s_mov_b32 s36, 0x80800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s8, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s9, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s10, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s11, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s12, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
+; GFX6-NEXT:    s_mov_b32 s38, 0x84400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s36, 0x84000
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x83c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s8, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s9, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s10, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s11, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s12, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s13, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s14, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s15, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s10, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s11, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s12, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s13, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s14, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s16, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s17, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s18, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s19, 3
-; GFX6-NEXT:    v_writelane_b32 v0, s20, 4
-; GFX6-NEXT:    v_writelane_b32 v0, s21, 5
-; GFX6-NEXT:    v_writelane_b32 v0, s22, 6
-; GFX6-NEXT:    v_writelane_b32 v0, s23, 7
-; GFX6-NEXT:    s_mov_b32 s36, 0x84800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s16, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s17, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s18, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s19, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s20, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
+; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s36, 0x80800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x84400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s16, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s17, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s18, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s19, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s20, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s21, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s22, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s23, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s18, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s19, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s20, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s21, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s22, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s24, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s25, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s26, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s27, 3
-; GFX6-NEXT:    v_writelane_b32 v0, s28, 4
-; GFX6-NEXT:    v_writelane_b32 v0, s29, 5
-; GFX6-NEXT:    v_writelane_b32 v0, s30, 6
-; GFX6-NEXT:    v_writelane_b32 v0, s31, 7
-; GFX6-NEXT:    s_mov_b32 s36, 0x85000
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s24, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s25, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s26, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s27, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s28, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
+; GFX6-NEXT:    s_mov_b32 s38, 0x85400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s36, 0x84800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s24, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s25, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s26, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s27, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s28, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s29, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s30, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s31, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s26, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s27, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s28, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s29, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s30, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, 15
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_writelane_b32 v4, s0, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
+; GFX6-NEXT:    s_mov_b32 s38, 0x85c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s4, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s5, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s6, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s7, 3
-; GFX6-NEXT:    s_mov_b32 s36, 0x85800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
+; GFX6-NEXT:    s_mov_b32 s36, 0x86000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s2, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s3, 1
-; GFX6-NEXT:    s_mov_b32 s4, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s4 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
+; GFX6-NEXT:    s_mov_b32 s4, 0x86400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x85000
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x85400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s4, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s5, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s6, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s7, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s4, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s5, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s6, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x2160
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s36, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s37, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s38, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s39, v0, 3
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s38, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x2170
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s34, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s35, v0, 1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s34, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -10458,31 +10472,44 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s6, 0x85e00
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s6, 0x85c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v0, 3
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s2, 0x84400
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(4)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v17
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v18
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v19
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v20
+; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_mov_b32 s2, 0x84800
 ; GFX6-NEXT:    v_mov_b32_e32 v20, v3
 ; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s2, 0x84000
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
 ; GFX6-NEXT:    v_mov_b32_e32 v19, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v17, v0
@@ -10518,14 +10545,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x83800
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[5:6], 8
 ; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83800
+; GFX6-NEXT:    s_mov_b32 s4, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10533,7 +10560,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x83400
+; GFX6-NEXT:    s_mov_b32 s4, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10541,7 +10568,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x83000
+; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10549,7 +10576,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10557,7 +10584,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82800
+; GFX6-NEXT:    s_mov_b32 s4, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10565,7 +10592,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82400
+; GFX6-NEXT:    s_mov_b32 s4, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10573,7 +10600,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82000
+; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10581,7 +10608,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10589,7 +10616,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81800
+; GFX6-NEXT:    s_mov_b32 s4, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10597,7 +10624,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81400
+; GFX6-NEXT:    s_mov_b32 s4, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10605,7 +10632,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81000
+; GFX6-NEXT:    s_mov_b32 s4, 0x80c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80
 ; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64
@@ -10637,19 +10664,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:240
 ; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v7, 1
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v0, s[38:39] offset:224
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v0, s[38:39] offset:192
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v0, s[38:39] offset:160
@@ -10658,7 +10685,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:112
@@ -10719,24 +10746,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, v16
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39]
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v17
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, v18
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v3, v19
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2100
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_nop 0
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v19, v3
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v18, v2
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v17, v1
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
@@ -10752,18 +10781,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:  .LBB1_2: ; %ret
 ; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    v_lshlrev_b64 v[4:5], 8, v[5:6]
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, s37
 ; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v4, vcc, s36, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[12:15], off offset:240
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[8:11], off offset:224
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:208
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:192
@@ -10773,7 +10802,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:176
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[16:19], off offset:160
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
@@ -10868,6 +10897,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX10-FLATSCR-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX10-FLATSCR-NEXT:  ; %bb.1: ; %bb0
+; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
+; GFX10-FLATSCR-NEXT:    ;;#ASMEND
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v88, v59
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v92, v63
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v87, v58
@@ -10877,6 +10910,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v90, v61
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v89, v60
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v35
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v68, v39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v34
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v33
@@ -10925,9 +10959,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v30
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v29
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
-; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
-; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v8, v33
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v53
@@ -10959,7 +10990,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v60
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v65
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v66
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v67
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 613349f32e2d5a..24319a639da447 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -15,13 +15,13 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
 ; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
@@ -226,12 +226,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v3
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v2, vcc
-; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GCN-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GCN-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v4
@@ -894,7 +894,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
 ; GCN-NEXT:    s_ashr_i32 s6, s7, 31
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s7, s6
 ; GCN-NEXT:    s_mov_b32 s8, s4
@@ -902,7 +902,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
@@ -1290,13 +1290,13 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_sub_u32 s2, 0, s8
 ; GCN-NEXT:    s_subb_u32 s3, 0, s9
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
@@ -1481,12 +1481,12 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_hi_u32 v6, v4, v2
@@ -1676,12 +1676,12 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_hi_u32 v6, v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 70687e15947c1d..7201ffaf561662 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -16,63 +16,88 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x0
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    ; kill: killed $sgpr0_sgpr1
+; CHECK-NEXT:    s_mov_b32 s7, 0x401c0000
+; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
+; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40180000
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
 ; CHECK-NEXT:    v_writelane_b32 v2, s0, 1
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 2
-; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
+; CHECK-NEXT:    s_mov_b32 s1, 0x40220000
 ; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    s_mov_b32 s3, 0x40260000
-; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
+; CHECK-NEXT:    s_mov_b32 s1, 0x40260000
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], 0
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
+; CHECK-NEXT:    v_writelane_b32 v2, s6, 9
+; CHECK-NEXT:    v_writelane_b32 v2, s7, 10
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 11
 ; CHECK-NEXT:    v_readlane_b32 s6, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s7, v2, 2
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
 ; CHECK-NEXT:    s_mov_b32 s1, s7
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s0, s6
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 6
+; CHECK-NEXT:    s_mov_b32 s0, s2
+; CHECK-NEXT:    v_writelane_b32 v2, s6, 1
+; CHECK-NEXT:    v_writelane_b32 v2, s7, 2
+; CHECK-NEXT:    v_readlane_b32 s6, v2, 9
+; CHECK-NEXT:    v_readlane_b32 s7, v2, 10
+; CHECK-NEXT:    s_mov_b32 s6, s2
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 7
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s6, s0
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 9
-; CHECK-NEXT:    s_mov_b32 s6, s0
 ; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
 ; CHECK-NEXT:    v_readlane_b32 s1, v2, 4
+; CHECK-NEXT:    s_mov_b32 s3, s1
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s1, s3
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s0, s6
-; CHECK-NEXT:    s_mov_b32 s2, s6
-; CHECK-NEXT:    s_mov_b32 s4, s6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
+; CHECK-NEXT:    v_readlane_b32 s1, v2, 6
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    s_mov_b32 s3, s1
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s1, s3
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 7
+; CHECK-NEXT:    v_readlane_b32 s1, v2, 8
+; CHECK-NEXT:    s_mov_b32 s3, s1
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s1, s3
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s2, v2, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
+; CHECK-NEXT:    s_mov_b32 s4, s0
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s2, v2, 11
 ; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 5
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
-; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 11
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 11
+; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index cf30131b8ab58a..012b3f976734de 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2527,7 +2527,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; SI-NEXT:    v_trunc_f32_e32 v3, v3
-; SI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; SI-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
@@ -2626,7 +2626,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; VI-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; VI-NEXT:    v_trunc_f32_e32 v3, v3
-; VI-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; VI-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; VI-NEXT:    v_cvt_u32_f32_e32 v6, v2
 ; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
@@ -2713,7 +2713,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index c5ab44e31c0320..e23f3cfad89bc8 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -14,12 +14,12 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GCN-NEXT:    s_sub_u32 s4, 0, s8
 ; GCN-NEXT:    s_subb_u32 s5, 0, s9
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
@@ -211,12 +211,12 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GCN-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GCN-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
@@ -688,7 +688,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
+; GCN-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_mov_b32 s2, -1
@@ -886,12 +886,12 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GCN-NEXT:    s_sub_u32 s4, 0, s2
 ; GCN-NEXT:    s_subb_u32 s5, 0, s3
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
@@ -1067,12 +1067,12 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
@@ -1335,7 +1335,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_mov_b32 s2, -1
@@ -1509,7 +1509,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 894c96acbbcd6b..f68d14a32b929a 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -15,13 +15,13 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
 ; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
@@ -221,12 +221,12 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GCN-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GCN-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
@@ -716,13 +716,13 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_sub_u32 s0, 0, s6
 ; GCN-NEXT:    s_subb_u32 s1, 0, s7
 ; GCN-NEXT:    s_mov_b32 s8, s4
-; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
@@ -903,7 +903,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1086,12 +1086,12 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
 ; GCN-NEXT:    v_subb_u32_e32 v5, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GCN-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll
index 3df7f3c26aad82..2b5762e1fa2a52 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll
@@ -253,7 +253,7 @@ bb:
 
 ; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]]
 ; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}
+; SI: v_madmk_f32 v{{[0-9]+}}, v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}
 
 ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 5ec9284c870c18..a8cbb0000ce722 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -131,9 +131,10 @@ define amdgpu_kernel void @fptrunc(
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 94b822ac488757..cadc23414dcac1 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -743,12 +743,12 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GFX1032-NEXT:    s_sub_u32 s9, 0, s4
 ; GFX1032-NEXT:    s_subb_u32 s10, 0, s5
-; GFX1032-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1032-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1032-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX1032-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1032-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1032-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1032-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
@@ -905,12 +905,12 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GFX1064-NEXT:    s_sub_u32 s9, 0, s4
 ; GFX1064-NEXT:    s_subb_u32 s10, 0, s5
-; GFX1064-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1064-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1064-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX1064-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX1064-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1064-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s8, v1
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
index b09976320b5622..bca505e5edd5f1 100644
--- a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=avr  -mattr=avr3 | FileCheck %s
+; RUN: llc < %s -march=avr -mattr=avr3 | FileCheck %s
+; RUN: llc < %s -march=avr -mattr=avr2 | FileCheck --check-prefix=AVR2 %s
 
 ; CHECK-LABEL: relax_to_jmp:
 ; CHECK: cpi     r{{[0-9]+}}, 0
@@ -7,6 +8,18 @@
 ; CHECK: [[BB1]]:
 ; CHECK: nop
 ; CHECK: [[BB2]]:
+
+;; A `RJMP` is generated instead of expected `JMP` for AVR2,
+;; and it is up to the linker to report 'out of range' or
+;; 'exceed flash maximum size'.
+; AVR2-LABEL: relax_to_jmp:
+; AVR2: cpi     r{{[0-9]+}}, 0
+; AVR2: brne    [[BB1:.LBB[0-9]+_[0-9]+]]
+; AVR2: rjmp    [[BB2:.LBB[0-9]+_[0-9]+]]
+; AVR2: [[BB1]]:
+; AVR2: nop
+; AVR2: [[BB2]]:
+
 define i8 @relax_to_jmp(i1 %a) {
 entry-block:
   br i1 %a, label %hello, label %finished
@@ -2075,6 +2088,18 @@ finished:
 ; CHECK: breq    [[BB2:.LBB[0-9]+_[0-9]+]]
 ; CHECK: jmp     [[BB1]]
 ; CHECK: [[BB2]]:
+
+;; A `RJMP` is generated instead of expected `JMP` for AVR2,
+;; and it is up to the linker to report 'out of range' or
+;; 'exceed flash maximum size'.
+; AVR2-LABEL: relax_to_jmp_backwards:
+; AVR2: [[BB1:.LBB[0-9]+_[0-9]+]]
+; AVR2: nop
+; AVR2: cpi     r{{[0-9]+}}, 0
+; AVR2: breq    [[BB2:.LBB[0-9]+_[0-9]+]]
+; AVR2: rjmp    [[BB1]]
+; AVR2: [[BB2]]:
+
 define i8 @relax_to_jmp_backwards(i1 %a) {
 entry-block:
   br label %hello
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation.ll b/llvm/test/CodeGen/AVR/branch-relaxation.ll
index cee3963c1ee3ef..3958cca8f20758 100644
--- a/llvm/test/CodeGen/AVR/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AVR/branch-relaxation.ll
@@ -1,10 +1,22 @@
 ; RUN: llc < %s -march=avr | FileCheck %s
+; RUN: llc < %s -march=avr -mcpu=avr5 | FileCheck -check-prefix=AVR5 %s
 
 ; CHECK-LABEL: relax_breq
 ; CHECK: cpi     r{{[0-9]+}}, 0
 ; CHECK: brne    .LBB0_1
 ; CHECK: rjmp    .LBB0_2
-; .LBB0_1:
+; CHECK: .LBB0_1:
+; CHECK: nop
+; CHECK: .LBB0_2:
+
+; AVR5-LABEL: relax_breq
+; AVR5:         andi    r24, 1
+; AVR5:         cpi     r24, 0
+; AVR5:         brne    .LBB0_1
+; AVR5:         rjmp    .LBB0_2
+; AVR5:       .LBB0_1:
+; AVR5:         nop
+; AVR5:       .LBB0_2:
 
 define i8 @relax_breq(i1 %a) {
 entry-block:
@@ -70,8 +82,14 @@ finished:
 ; CHECK: cpi     r{{[0-9]+}}, 0
 ; CHECK: breq    [[END_BB:.LBB[0-9]+_[0-9]+]]
 ; CHECK: nop
-; ...
-; .LBB0_1:
+; CHECK: [[END_BB]]
+
+; AVR5-LABEL: no_relax_breq
+; AVR5:         cpi     r{{[0-9]+}}, 0
+; AVR5:         breq    [[END_BB:.LBB[0-9]+_[0-9]+]]
+; AVR5:         nop
+; AVR5:       [[END_BB]]
+
 define i8 @no_relax_breq(i1 %a) {
 entry-block:
   br i1 %a, label %hello, label %finished
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
index f38cd4cb336d33..ecbfec96a19b03 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -1,6 +1,6 @@
-; RUN: opt -O2 %s | llvm-dis > %t1
-; RUN: llc -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK,CHECK-EB,CHECK-ALU64 %s
-; RUN: llc -mattr=+alu32 -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK,CHECK-EB,CHECK-ALU32 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -O2 -S < %s | llc -filetype=asm | FileCheck -check-prefixes=CHECK,CHECK-ALU64 %s
+; RUN: opt -O2 -S < %s | llc -mattr=+alu32 -filetype=asm | FileCheck -check-prefixes=CHECK,CHECK-ALU32 %s
 ; Source code:
 ;   struct s {
 ;     unsigned long long f1;
@@ -26,6 +26,68 @@ target triple = "bpfeb"
 
 ; Function Attrs: nounwind readnone
 define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+; CHECK-ALU64-LABEL: test:
+; CHECK-ALU64:       .Ltest$local:
+; CHECK-ALU64-NEXT:    .type .Ltest$local,@function
+; CHECK-ALU64-NEXT:  .Lfunc_begin0:
+; CHECK-ALU64-NEXT:    .loc 1 11 0 # test.c:11:0
+; CHECK-ALU64-NEXT:    .cfi_sections .debug_frame
+; CHECK-ALU64-NEXT:    .cfi_startproc
+; CHECK-ALU64-NEXT:  # %bb.0: # %entry
+; CHECK-ALU64-NEXT:    #DEBUG_VALUE: test:arg <- $r1
+; CHECK-ALU64-NEXT:  .Ltmp0:
+; CHECK-ALU64-NEXT:    r1 = 20
+; CHECK-ALU64-NEXT:  .Ltmp1:
+; CHECK-ALU64-NEXT:  .Ltmp2:
+; CHECK-ALU64-NEXT:  .Ltmp3:
+; CHECK-ALU64-NEXT:    r0 = 4
+; CHECK-ALU64-NEXT:  .Ltmp4:
+; CHECK-ALU64-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
+; CHECK-ALU64-NEXT:  .Ltmp5:
+; CHECK-ALU64-NEXT:  .Ltmp6:
+; CHECK-ALU64-NEXT:    r0 += r1
+; CHECK-ALU64-NEXT:  .Ltmp7:
+; CHECK-ALU64-NEXT:    r1 = 45
+; CHECK-ALU64-NEXT:    .loc 1 13 67 # test.c:13:67
+; CHECK-ALU64-NEXT:  .Ltmp8:
+; CHECK-ALU64-NEXT:    r0 += r1
+; CHECK-ALU64-NEXT:    .loc 1 12 3 # test.c:12:3
+; CHECK-ALU64-NEXT:  .Ltmp9:
+; CHECK-ALU64-NEXT:    exit
+; CHECK-ALU64-NEXT:  .Ltmp10:
+; CHECK-ALU64-NEXT:  .Ltmp11:
+;
+; CHECK-ALU32-LABEL: test:
+; CHECK-ALU32:       .Ltest$local:
+; CHECK-ALU32-NEXT:    .type .Ltest$local,@function
+; CHECK-ALU32-NEXT:  .Lfunc_begin0:
+; CHECK-ALU32-NEXT:    .loc 1 11 0 # test.c:11:0
+; CHECK-ALU32-NEXT:    .cfi_sections .debug_frame
+; CHECK-ALU32-NEXT:    .cfi_startproc
+; CHECK-ALU32-NEXT:  # %bb.0: # %entry
+; CHECK-ALU32-NEXT:    #DEBUG_VALUE: test:arg <- $r1
+; CHECK-ALU32-NEXT:  .Ltmp0:
+; CHECK-ALU32-NEXT:    r1 = 20
+; CHECK-ALU32-NEXT:  .Ltmp1:
+; CHECK-ALU32-NEXT:  .Ltmp2:
+; CHECK-ALU32-NEXT:  .Ltmp3:
+; CHECK-ALU32-NEXT:    r0 = 4
+; CHECK-ALU32-NEXT:  .Ltmp4:
+; CHECK-ALU32-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
+; CHECK-ALU32-NEXT:  .Ltmp5:
+; CHECK-ALU32-NEXT:  .Ltmp6:
+; CHECK-ALU32-NEXT:    w0 += w1
+; CHECK-ALU32-NEXT:  .Ltmp7:
+; CHECK-ALU32-NEXT:    r1 = 45
+; CHECK-ALU32-NEXT:    .loc 1 13 67 # test.c:13:67
+; CHECK-ALU32-NEXT:  .Ltmp8:
+; CHECK-ALU32-NEXT:    w0 += w1
+; CHECK-ALU32-NEXT:    # kill: def $w0 killed $w0 killed $r0
+; CHECK-ALU32-NEXT:    .loc 1 12 3 # test.c:12:3
+; CHECK-ALU32-NEXT:  .Ltmp9:
+; CHECK-ALU32-NEXT:    exit
+; CHECK-ALU32-NEXT:  .Ltmp10:
+; CHECK-ALU32-NEXT:  .Ltmp11:
 entry:
   call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
   %0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 5, i32 6), !dbg !32, !llvm.preserve.access.index !18
@@ -37,15 +99,6 @@ entry:
   ret i32 %add1, !dbg !38
 }
 
-; CHECK:             r1 = 20
-; CHECK:             r0 = 4
-; CHECK-ALU64:       r0 += r1
-; CHECK-ALU32:       w0 += w1
-; CHECK-EB:          r1 = 45
-; CHECK-ALU64:       r0 += r1
-; CHECK-ALU32:       w0 += w1
-; CHECK:             exit
-
 ; CHECK:             .long   1                       # BTF_KIND_STRUCT(id = 2)
 
 ; CHECK:             .byte   115                     # string offset=1
@@ -124,3 +177,4 @@ attributes #2 = { nounwind readnone speculatable }
 !36 = !DILocation(line: 14, column: 10, scope: !13)
 !37 = !DILocation(line: 13, column: 67, scope: !13)
 !38 = !DILocation(line: 12, column: 3, scope: !13)
+
diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
index 98a333d54fd38e..66a1cf291cd4c0 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -1,6 +1,6 @@
-; RUN: opt -O2 %s | llvm-dis > %t1
-; RUN: llc -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK,CHECK-EL,CHECK-ALU64 %s
-; RUN: llc -mattr=+alu32 -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK,CHECK-EL,CHECK-ALU32 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -O2 -S < %s | llc -filetype=asm | FileCheck -check-prefixes=CHECK,CHECK-ALU64 %s
+; RUN: opt -O2 -S < %s | llc -mattr=+alu32 -filetype=asm | FileCheck -check-prefixes=CHECK,CHECK-ALU32 %s
 ; Source code:
 ;   struct s {
 ;     unsigned long long f1;
@@ -26,6 +26,68 @@ target triple = "bpfel"
 
 ; Function Attrs: nounwind readnone
 define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
+; CHECK-ALU64-LABEL: test:
+; CHECK-ALU64:       .Ltest$local:
+; CHECK-ALU64-NEXT:    .type .Ltest$local,@function
+; CHECK-ALU64-NEXT:  .Lfunc_begin0:
+; CHECK-ALU64-NEXT:    .loc 1 11 0 # test.c:11:0
+; CHECK-ALU64-NEXT:    .cfi_sections .debug_frame
+; CHECK-ALU64-NEXT:    .cfi_startproc
+; CHECK-ALU64-NEXT:  # %bb.0: # %entry
+; CHECK-ALU64-NEXT:    #DEBUG_VALUE: test:arg <- $r1
+; CHECK-ALU64-NEXT:  .Ltmp0:
+; CHECK-ALU64-NEXT:    r1 = 20
+; CHECK-ALU64-NEXT:  .Ltmp1:
+; CHECK-ALU64-NEXT:  .Ltmp2:
+; CHECK-ALU64-NEXT:  .Ltmp3:
+; CHECK-ALU64-NEXT:    r0 = 4
+; CHECK-ALU64-NEXT:  .Ltmp4:
+; CHECK-ALU64-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
+; CHECK-ALU64-NEXT:  .Ltmp5:
+; CHECK-ALU64-NEXT:  .Ltmp6:
+; CHECK-ALU64-NEXT:    r0 += r1
+; CHECK-ALU64-NEXT:  .Ltmp7:
+; CHECK-ALU64-NEXT:    r1 = 50
+; CHECK-ALU64-NEXT:    .loc 1 13 67 # test.c:13:67
+; CHECK-ALU64-NEXT:  .Ltmp8:
+; CHECK-ALU64-NEXT:    r0 += r1
+; CHECK-ALU64-NEXT:    .loc 1 12 3 # test.c:12:3
+; CHECK-ALU64-NEXT:  .Ltmp9:
+; CHECK-ALU64-NEXT:    exit
+; CHECK-ALU64-NEXT:  .Ltmp10:
+; CHECK-ALU64-NEXT:  .Ltmp11:
+;
+; CHECK-ALU32-LABEL: test:
+; CHECK-ALU32:       .Ltest$local:
+; CHECK-ALU32-NEXT:    .type .Ltest$local,@function
+; CHECK-ALU32-NEXT:  .Lfunc_begin0:
+; CHECK-ALU32-NEXT:    .loc 1 11 0 # test.c:11:0
+; CHECK-ALU32-NEXT:    .cfi_sections .debug_frame
+; CHECK-ALU32-NEXT:    .cfi_startproc
+; CHECK-ALU32-NEXT:  # %bb.0: # %entry
+; CHECK-ALU32-NEXT:    #DEBUG_VALUE: test:arg <- $r1
+; CHECK-ALU32-NEXT:  .Ltmp0:
+; CHECK-ALU32-NEXT:    r1 = 20
+; CHECK-ALU32-NEXT:  .Ltmp1:
+; CHECK-ALU32-NEXT:  .Ltmp2:
+; CHECK-ALU32-NEXT:  .Ltmp3:
+; CHECK-ALU32-NEXT:    r0 = 4
+; CHECK-ALU32-NEXT:  .Ltmp4:
+; CHECK-ALU32-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
+; CHECK-ALU32-NEXT:  .Ltmp5:
+; CHECK-ALU32-NEXT:  .Ltmp6:
+; CHECK-ALU32-NEXT:    w0 += w1
+; CHECK-ALU32-NEXT:  .Ltmp7:
+; CHECK-ALU32-NEXT:    r1 = 50
+; CHECK-ALU32-NEXT:    .loc 1 13 67 # test.c:13:67
+; CHECK-ALU32-NEXT:  .Ltmp8:
+; CHECK-ALU32-NEXT:    w0 += w1
+; CHECK-ALU32-NEXT:    # kill: def $w0 killed $w0 killed $r0
+; CHECK-ALU32-NEXT:    .loc 1 12 3 # test.c:12:3
+; CHECK-ALU32-NEXT:  .Ltmp9:
+; CHECK-ALU32-NEXT:    exit
+; CHECK-ALU32-NEXT:  .Ltmp10:
+; CHECK-ALU32-NEXT:  .Ltmp11:
 entry:
   call void @llvm.dbg.value(metadata ptr %arg, metadata !30, metadata !DIExpression()), !dbg !31
   %0 = tail call ptr @llvm.preserve.struct.access.index.p0.p0.ss(ptr elementtype(%struct.s) %arg, i32 5, i32 6), !dbg !32, !llvm.preserve.access.index !18
@@ -37,15 +99,6 @@ entry:
   ret i32 %add1, !dbg !38
 }
 
-; CHECK:             r1 = 20
-; CHECK:             r0 = 4
-; CHECK-ALU64:       r0 += r1
-; CHECK-ALU32:       w0 += w1
-; CHECK-EL:          r1 = 50
-; CHECK-ALU64:       r0 += r1
-; CHECK-ALU32:       w0 += w1
-; CHECK:             exit
-
 ; CHECK:             .long   1                       # BTF_KIND_STRUCT(id = 2)
 
 ; CHECK:             .byte   115                     # string offset=1
diff --git a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index d30fb39f874110..67f4dd72ea0b2d 100644
--- a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -153,8 +153,8 @@ body:             |
     %13 = S2_asl_r_p_acc %13, %47, %8.isub_lo
     %51 = A2_tfrpi 0
 
-    ; CHECK: $d0 = S2_extractup undef renamable $d0, 6, 25
-    ; CHECK: $d1 = A2_tfrpi 2
+    ; CHECK: $d2 = S2_extractup undef renamable $d0, 6, 25
+    ; CHECK: $d0 = A2_tfrpi 2
     ; CHECK: $d13 = A2_tfrpi -1
     ; CHECK-NOT: undef $r4
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-frame-index.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-frame-index.mir
new file mode 100644
index 00000000000000..1e1879ac21b908
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv32/legalize-frame-index.mir
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s
+--- |
+
+  define ptr @alloca32() {
+  entry:
+    %ptr0 = alloca i32, align 4
+    ret ptr %ptr0
+  }
+
+...
+---
+name:            alloca32
+stack:
+  - { id: 0, name: ptr0, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.1.entry:
+    ; CHECK-LABEL: name: alloca32
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.ptr0
+    ; CHECK-NEXT: $x10 = COPY [[FRAME_INDEX]](p0)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:_(p0) = G_FRAME_INDEX %stack.0.ptr0
+    $x10 = COPY %0(p0)
+    PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-frame-index.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-frame-index.mir
new file mode 100644
index 00000000000000..6d083d8bde3cfc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-frame-index.mir
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s
+--- |
+
+  define ptr @alloca64() {
+  entry:
+    %ptr0 = alloca i64, align 4
+    ret ptr %ptr0
+  }
+
+...
+---
+name:            alloca64
+stack:
+  - { id: 0, name: ptr0, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.1.entry:
+    ; CHECK-LABEL: name: alloca64
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.ptr0
+    ; CHECK-NEXT: $x10 = COPY [[FRAME_INDEX]](p0)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:_(p0) = G_FRAME_INDEX %stack.0.ptr0
+    $x10 = COPY %0(p0)
+    PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 29eaaee57868a8..c28594088dac5c 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -63,7 +63,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zifencei %s -o - | FileCheck --check-prefixes=CHECK,RV32ZIFENCEI %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zicntr %s -o - | FileCheck --check-prefixes=CHECK,RV32ZICNTR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zihpm %s -o - | FileCheck --check-prefixes=CHECK,RV32ZIHPM %s
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-zfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFA %s
+; RUN: llc -mtriple=riscv32 -mattr=+zfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFA %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+experimental-zvbb %s -o - | FileCheck --check-prefix=RV32ZVBB %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zve64x -mattr=+experimental-zvbc %s -o - | FileCheck --check-prefix=RV32ZVBC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+experimental-zvkb %s -o - | FileCheck --check-prefix=RV32ZVKB %s
@@ -153,7 +153,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zifencei %s -o - | FileCheck --check-prefixes=CHECK,RV64ZIFENCEI %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zicntr %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICNTR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zihpm %s -o - | FileCheck --check-prefixes=CHECK,RV64ZIHPM %s
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-zfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFA %s
+; RUN: llc -mtriple=riscv64 -mattr=+zfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFA %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+experimental-zvbb %s -o - | FileCheck --check-prefix=RV64ZVBB %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zve64x -mattr=+experimental-zvbc %s -o - | FileCheck --check-prefix=RV64ZVBC %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+experimental-zvkb %s -o - | FileCheck --check-prefix=RV64ZVKB %s
@@ -243,7 +243,7 @@
 ; RV32ZIFENCEI: .attribute 5, "rv32i2p1_zifencei2p0"
 ; RV32ZICNTR: .attribute 5, "rv32i2p1_zicntr2p0_zicsr2p0"
 ; RV32ZIHPM: .attribute 5, "rv32i2p1_zicsr2p0_zihpm2p0"
-; RV32ZFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfa0p2"
+; RV32ZFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfa1p0"
 ; RV32ZVBB: .attribute 5, "rv32i2p1_zicsr2p0_zvbb1p0_zve32x1p0_zvkb1p0_zvl32b1p0"
 ; RV32ZVBC: .attribute 5, "rv32i2p1_zicsr2p0_zvbc1p0_zve32x1p0_zve64x1p0_zvl32b1p0_zvl64b1p0"
 ; RV32ZVKB: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvkb1p0_zvl32b1p0"
@@ -332,7 +332,7 @@
 ; RV64ZIFENCEI: .attribute 5, "rv64i2p1_zifencei2p0"
 ; RV64ZICNTR: .attribute 5, "rv64i2p1_zicntr2p0_zicsr2p0"
 ; RV64ZIHPM: .attribute 5, "rv64i2p1_zicsr2p0_zihpm2p0"
-; RV64ZFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfa0p2"
+; RV64ZFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfa1p0"
 ; RV64ZVBB: .attribute 5, "rv64i2p1_zicsr2p0_zvbb1p0_zve32x1p0_zvkb1p0_zvl32b1p0"
 ; RV64ZVBC: .attribute 5, "rv64i2p1_zicsr2p0_zvbc1p0_zve32x1p0_zve64x1p0_zvl32b1p0_zvl64b1p0"
 ; RV64ZVKB: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvkb1p0_zvl32b1p0"
diff --git a/llvm/test/CodeGen/RISCV/double-zfa.ll b/llvm/test/CodeGen/RISCV/double-zfa.ll
index 881430f6f5ffdd..904dd9f2ccbef9 100644
--- a/llvm/test/CodeGen/RISCV/double-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/double-zfa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi ilp32d -mattr=+experimental-zfa,+d < %s \
+; RUN: llc -mtriple=riscv32 -target-abi ilp32d -mattr=+zfa,+d < %s \
 ; RUN:     | FileCheck --check-prefixes=CHECK,RV32IDZFA %s
-; RUN: llc -mtriple=riscv64 -target-abi lp64d -mattr=+experimental-zfa,+d < %s \
+; RUN: llc -mtriple=riscv64 -target-abi lp64d -mattr=+zfa,+d < %s \
 ; RUN:     | FileCheck --check-prefixes=CHECK,RV64DZFA %s
 
 define double @loadfpimm1() {
@@ -133,6 +133,25 @@ define double @loadfpimm15() {
   ret double 0x0008000000000000
 }
 
+define double @loadfpimm16() {
+; CHECK-LABEL: loadfpimm16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fli.d fa0, -1.0
+; CHECK-NEXT:    ret
+  ret double -1.0
+}
+
+; Ensure fli isn't incorrectly used for negated versions of numbers in the fli
+; table.
+define double @loadfpimm17() {
+; CHECK-LABEL: loadfpimm17:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI15_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI15_0)(a0)
+; CHECK-NEXT:    ret
+  ret double -2.0
+}
+
 declare double @llvm.minimum.f64(double, double)
 
 define double @fminm_d(double %a, double %b) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/fli-licm.ll b/llvm/test/CodeGen/RISCV/fli-licm.ll
index f37ace801b1595..4962a146362d56 100644
--- a/llvm/test/CodeGen/RISCV/fli-licm.ll
+++ b/llvm/test/CodeGen/RISCV/fli-licm.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=riscv32 -target-abi=ilp32f -mattr=+experimental-zfa \
+; RUN: llc < %s -mtriple=riscv32 -target-abi=ilp32f -mattr=+zfa \
 ; RUN:   | FileCheck %s --check-prefix=RV32
-; RUN: llc < %s -mtriple=riscv64 -target-abi=lp64f -mattr=+experimental-zfa \
+; RUN: llc < %s -mtriple=riscv64 -target-abi=lp64f -mattr=+zfa \
 ; RUN:   | FileCheck %s --check-prefix=RV64
 
 ; The purpose of this test is to check that an FLI instruction that
diff --git a/llvm/test/CodeGen/RISCV/float-zfa.ll b/llvm/test/CodeGen/RISCV/float-zfa.ll
index 94da29bd6bece8..baa45e00b565fb 100644
--- a/llvm/test/CodeGen/RISCV/float-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/float-zfa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+experimental-zfa < %s \
+; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+zfa < %s \
 ; RUN:     | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+experimental-zfa < %s \
+; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+zfa < %s \
 ; RUN:     | FileCheck %s
 
 define float @loadfpimm1() {
@@ -87,6 +87,25 @@ define float @loadfpimm10() {
   ret float 0.00006103515625
 }
 
+define float @loadfpimm11() {
+; CHECK-LABEL: loadfpimm11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fli.s fa0, -1.0
+; CHECK-NEXT:    ret
+  ret float -1.0
+}
+
+; Ensure fli isn't incorrectly used for negated versions of numbers in the fli
+; table.
+define float @loadfpimm12() {
+; CHECK-LABEL: loadfpimm12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, 786432
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+  ret float -2.0
+}
+
 declare float @llvm.minimum.f32(float, float)
 
 define float @fminm_s(float %a, float %b) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 279f4628147f27..434c9f05bd16f7 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -733,3 +733,238 @@ entry:
   %0 = load i32, ptr inttoptr (i64 2147481600 to ptr)
   ret i32 %0
 }
+
+%struct.S = type { i64, i64 }
+
+define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
+; RV32I-LABEL: fold_addi_from_different_bb:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a4
+; RV32I-NEXT:    mv s1, a3
+; RV32I-NEXT:    mv s2, a2
+; RV32I-NEXT:    beqz a3, .LBB20_3
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    slti a1, s1, 0
+; RV32I-NEXT:    beqz a1, .LBB20_4
+; RV32I-NEXT:  .LBB20_2:
+; RV32I-NEXT:    li s3, 0
+; RV32I-NEXT:    li s4, 0
+; RV32I-NEXT:    j .LBB20_6
+; RV32I-NEXT:  .LBB20_3:
+; RV32I-NEXT:    seqz a1, s2
+; RV32I-NEXT:    bnez a1, .LBB20_2
+; RV32I-NEXT:  .LBB20_4: # %for.body.lr.ph
+; RV32I-NEXT:    li s5, 0
+; RV32I-NEXT:    li s6, 0
+; RV32I-NEXT:    li s3, 0
+; RV32I-NEXT:    li s4, 0
+; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    add a0, s0, a0
+; RV32I-NEXT:    addi s7, a0, 8
+; RV32I-NEXT:  .LBB20_5: # %for.body
+; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call f@plt
+; RV32I-NEXT:    lw a0, 4(s7)
+; RV32I-NEXT:    lw a1, 0(s7)
+; RV32I-NEXT:    add a0, a0, s4
+; RV32I-NEXT:    add s3, a1, s3
+; RV32I-NEXT:    sltu s4, s3, a1
+; RV32I-NEXT:    addi s5, s5, 1
+; RV32I-NEXT:    seqz a1, s5
+; RV32I-NEXT:    add s6, s6, a1
+; RV32I-NEXT:    xor a1, s5, s2
+; RV32I-NEXT:    xor a2, s6, s1
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    add s4, a0, s4
+; RV32I-NEXT:    bnez a1, .LBB20_5
+; RV32I-NEXT:  .LBB20_6: # %for.cond.cleanup
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    mv a1, s4
+; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    ret
+;
+; RV32I-MEDIUM-LABEL: fold_addi_from_different_bb:
+; RV32I-MEDIUM:       # %bb.0: # %entry
+; RV32I-MEDIUM-NEXT:    addi sp, sp, -48
+; RV32I-MEDIUM-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s5, 20(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
+; RV32I-MEDIUM-NEXT:    mv s0, a4
+; RV32I-MEDIUM-NEXT:    mv s1, a3
+; RV32I-MEDIUM-NEXT:    mv s2, a2
+; RV32I-MEDIUM-NEXT:    beqz a3, .LBB20_3
+; RV32I-MEDIUM-NEXT:  # %bb.1: # %entry
+; RV32I-MEDIUM-NEXT:    slti a1, s1, 0
+; RV32I-MEDIUM-NEXT:    beqz a1, .LBB20_4
+; RV32I-MEDIUM-NEXT:  .LBB20_2:
+; RV32I-MEDIUM-NEXT:    li s3, 0
+; RV32I-MEDIUM-NEXT:    li s4, 0
+; RV32I-MEDIUM-NEXT:    j .LBB20_6
+; RV32I-MEDIUM-NEXT:  .LBB20_3:
+; RV32I-MEDIUM-NEXT:    seqz a1, s2
+; RV32I-MEDIUM-NEXT:    bnez a1, .LBB20_2
+; RV32I-MEDIUM-NEXT:  .LBB20_4: # %for.body.lr.ph
+; RV32I-MEDIUM-NEXT:    li s5, 0
+; RV32I-MEDIUM-NEXT:    li s6, 0
+; RV32I-MEDIUM-NEXT:    li s3, 0
+; RV32I-MEDIUM-NEXT:    li s4, 0
+; RV32I-MEDIUM-NEXT:    slli a0, a0, 4
+; RV32I-MEDIUM-NEXT:    add a0, s0, a0
+; RV32I-MEDIUM-NEXT:    addi s7, a0, 8
+; RV32I-MEDIUM-NEXT:  .LBB20_5: # %for.body
+; RV32I-MEDIUM-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32I-MEDIUM-NEXT:    mv a0, s0
+; RV32I-MEDIUM-NEXT:    call f@plt
+; RV32I-MEDIUM-NEXT:    lw a0, 4(s7)
+; RV32I-MEDIUM-NEXT:    lw a1, 0(s7)
+; RV32I-MEDIUM-NEXT:    add a0, a0, s4
+; RV32I-MEDIUM-NEXT:    add s3, a1, s3
+; RV32I-MEDIUM-NEXT:    sltu s4, s3, a1
+; RV32I-MEDIUM-NEXT:    addi s5, s5, 1
+; RV32I-MEDIUM-NEXT:    seqz a1, s5
+; RV32I-MEDIUM-NEXT:    add s6, s6, a1
+; RV32I-MEDIUM-NEXT:    xor a1, s5, s2
+; RV32I-MEDIUM-NEXT:    xor a2, s6, s1
+; RV32I-MEDIUM-NEXT:    or a1, a1, a2
+; RV32I-MEDIUM-NEXT:    add s4, a0, s4
+; RV32I-MEDIUM-NEXT:    bnez a1, .LBB20_5
+; RV32I-MEDIUM-NEXT:  .LBB20_6: # %for.cond.cleanup
+; RV32I-MEDIUM-NEXT:    mv a0, s3
+; RV32I-MEDIUM-NEXT:    mv a1, s4
+; RV32I-MEDIUM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s5, 20(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s6, 16(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    lw s7, 12(sp) # 4-byte Folded Reload
+; RV32I-MEDIUM-NEXT:    addi sp, sp, 48
+; RV32I-MEDIUM-NEXT:    ret
+;
+; RV64I-LABEL: fold_addi_from_different_bb:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -48
+; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    blez a1, .LBB20_3
+; RV64I-NEXT:  # %bb.1: # %for.body.lr.ph
+; RV64I-NEXT:    mv s0, a2
+; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    li s2, 0
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    addi s3, a0, 8
+; RV64I-NEXT:  .LBB20_2: # %for.body
+; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    call f@plt
+; RV64I-NEXT:    ld a0, 0(s3)
+; RV64I-NEXT:    addi s1, s1, -1
+; RV64I-NEXT:    add s2, a0, s2
+; RV64I-NEXT:    bnez s1, .LBB20_2
+; RV64I-NEXT:    j .LBB20_4
+; RV64I-NEXT:  .LBB20_3:
+; RV64I-NEXT:    li s2, 0
+; RV64I-NEXT:  .LBB20_4: # %for.cond.cleanup
+; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 48
+; RV64I-NEXT:    ret
+;
+; RV64I-MEDIUM-LABEL: fold_addi_from_different_bb:
+; RV64I-MEDIUM:       # %bb.0: # %entry
+; RV64I-MEDIUM-NEXT:    addi sp, sp, -48
+; RV64I-MEDIUM-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
+; RV64I-MEDIUM-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
+; RV64I-MEDIUM-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
+; RV64I-MEDIUM-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
+; RV64I-MEDIUM-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-MEDIUM-NEXT:    blez a1, .LBB20_3
+; RV64I-MEDIUM-NEXT:  # %bb.1: # %for.body.lr.ph
+; RV64I-MEDIUM-NEXT:    mv s0, a2
+; RV64I-MEDIUM-NEXT:    mv s1, a1
+; RV64I-MEDIUM-NEXT:    li s2, 0
+; RV64I-MEDIUM-NEXT:    slli a0, a0, 4
+; RV64I-MEDIUM-NEXT:    add a0, a2, a0
+; RV64I-MEDIUM-NEXT:    addi s3, a0, 8
+; RV64I-MEDIUM-NEXT:  .LBB20_2: # %for.body
+; RV64I-MEDIUM-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64I-MEDIUM-NEXT:    mv a0, s0
+; RV64I-MEDIUM-NEXT:    call f@plt
+; RV64I-MEDIUM-NEXT:    ld a0, 0(s3)
+; RV64I-MEDIUM-NEXT:    addi s1, s1, -1
+; RV64I-MEDIUM-NEXT:    add s2, a0, s2
+; RV64I-MEDIUM-NEXT:    bnez s1, .LBB20_2
+; RV64I-MEDIUM-NEXT:    j .LBB20_4
+; RV64I-MEDIUM-NEXT:  .LBB20_3:
+; RV64I-MEDIUM-NEXT:    li s2, 0
+; RV64I-MEDIUM-NEXT:  .LBB20_4: # %for.cond.cleanup
+; RV64I-MEDIUM-NEXT:    mv a0, s2
+; RV64I-MEDIUM-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; RV64I-MEDIUM-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
+; RV64I-MEDIUM-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
+; RV64I-MEDIUM-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
+; RV64I-MEDIUM-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
+; RV64I-MEDIUM-NEXT:    addi sp, sp, 48
+; RV64I-MEDIUM-NEXT:    ret
+entry:
+  %cmp4 = icmp sgt i64 %n, 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  ; TODO: when this GEP is expanded, the resulting `addi` should be folded
+  ; into the load in the loop body.
+  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  ret i64 %s.0.lcssa
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.06 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %s.05 = phi i64 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+  call void @f(ptr %a)
+  %0 = load i64, ptr %y, align 8
+  %add = add nsw i64 %0, %s.05
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare void @f(ptr)
diff --git a/llvm/test/CodeGen/RISCV/half-zfa-fli.ll b/llvm/test/CodeGen/RISCV/half-zfa-fli.ll
index f82ae5c00d1596..2805a8e582b34b 100644
--- a/llvm/test/CodeGen/RISCV/half-zfa-fli.ll
+++ b/llvm/test/CodeGen/RISCV/half-zfa-fli.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+experimental-zfa,+zfh < %s \
+; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+zfa,+zfh < %s \
 ; RUN:     | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+experimental-zfa,+zfh < %s \
+; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+zfa,+zfh < %s \
 ; RUN:     | FileCheck %s
-; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+experimental-zfa,+zfhmin < %s \
+; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+zfa,+zfhmin < %s \
 ; RUN:     | FileCheck %s --check-prefix=ZFHMIN
-; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+experimental-zfa,+zfhmin < %s \
+; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+zfa,+zfhmin < %s \
 ; RUN:     | FileCheck %s --check-prefix=ZFHMIN
 
 define half @loadfpimm1() {
@@ -180,3 +180,34 @@ define half @loadfpimm12() {
 ; ZFHMIN-NEXT:    ret
   ret half 0xH7c01
 }
+
+define half @loadfpimm13() {
+; CHECK-LABEL: loadfpimm13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fli.h fa0, -1.0
+; CHECK-NEXT:    ret
+;
+; ZFHMIN-LABEL: loadfpimm13:
+; ZFHMIN:       # %bb.0:
+; ZFHMIN-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZFHMIN-NEXT:    flh fa0, %lo(.LCPI12_0)(a0)
+; ZFHMIN-NEXT:    ret
+  ret half -1.0
+}
+
+; Ensure fli isn't incorrectly used for negated versions of numbers in the fli
+; table.
+define half @loadfpimm14() {
+; CHECK-LABEL: loadfpimm14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, 1048572
+; CHECK-NEXT:    fmv.h.x fa0, a0
+; CHECK-NEXT:    ret
+;
+; ZFHMIN-LABEL: loadfpimm14:
+; ZFHMIN:       # %bb.0:
+; ZFHMIN-NEXT:    lui a0, 1048572
+; ZFHMIN-NEXT:    fmv.h.x fa0, a0
+; ZFHMIN-NEXT:    ret
+  ret half -2.0
+}
diff --git a/llvm/test/CodeGen/RISCV/half-zfa.ll b/llvm/test/CodeGen/RISCV/half-zfa.ll
index 732075e186b29f..93ffcb8a1a05c2 100644
--- a/llvm/test/CodeGen/RISCV/half-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/half-zfa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+experimental-zfa,+zfh < %s \
+; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+zfa,+zfh < %s \
 ; RUN:     | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+experimental-zfa,+zfh < %s \
+; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+zfa,+zfh < %s \
 ; RUN:     | FileCheck %s
 
 declare half @llvm.minimum.f16(half, half)
diff --git a/llvm/test/CodeGen/RISCV/imm.ll b/llvm/test/CodeGen/RISCV/imm.ll
index ae66af72679759..738318e4bd6774 100644
--- a/llvm/test/CodeGen/RISCV/imm.ll
+++ b/llvm/test/CodeGen/RISCV/imm.ll
@@ -1157,60 +1157,44 @@ define i64 @imm_2reg_1() nounwind {
 ; RV32I-NEXT:    lui a1, 983040
 ; RV32I-NEXT:    ret
 ;
-; RV64-NOPOOL-LABEL: imm_2reg_1:
-; RV64-NOPOOL:       # %bb.0:
-; RV64-NOPOOL-NEXT:    lui a0, 1048430
-; RV64-NOPOOL-NEXT:    addiw a0, a0, 1493
-; RV64-NOPOOL-NEXT:    slli a0, a0, 13
-; RV64-NOPOOL-NEXT:    addi a0, a0, -1921
-; RV64-NOPOOL-NEXT:    srli a0, a0, 4
-; RV64-NOPOOL-NEXT:    not a0, a0
-; RV64-NOPOOL-NEXT:    ret
-;
-; RV64I-POOL-LABEL: imm_2reg_1:
-; RV64I-POOL:       # %bb.0:
-; RV64I-POOL-NEXT:    lui a0, %hi(.LCPI27_0)
-; RV64I-POOL-NEXT:    ld a0, %lo(.LCPI27_0)(a0)
-; RV64I-POOL-NEXT:    ret
+; RV64I-LABEL: imm_2reg_1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 74565
+; RV64I-NEXT:    addiw a0, a0, 1656
+; RV64I-NEXT:    slli a1, a0, 57
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
 ;
 ; RV64IZBA-LABEL: imm_2reg_1:
 ; RV64IZBA:       # %bb.0:
-; RV64IZBA-NEXT:    lui a0, 1048430
-; RV64IZBA-NEXT:    addiw a0, a0, 1493
-; RV64IZBA-NEXT:    slli a0, a0, 13
-; RV64IZBA-NEXT:    addi a0, a0, -1921
-; RV64IZBA-NEXT:    srli a0, a0, 4
-; RV64IZBA-NEXT:    not a0, a0
+; RV64IZBA-NEXT:    lui a0, 74565
+; RV64IZBA-NEXT:    addiw a0, a0, 1656
+; RV64IZBA-NEXT:    slli a1, a0, 57
+; RV64IZBA-NEXT:    add a0, a0, a1
 ; RV64IZBA-NEXT:    ret
 ;
 ; RV64IZBB-LABEL: imm_2reg_1:
 ; RV64IZBB:       # %bb.0:
-; RV64IZBB-NEXT:    lui a0, 1048430
-; RV64IZBB-NEXT:    addiw a0, a0, 1493
-; RV64IZBB-NEXT:    slli a0, a0, 13
-; RV64IZBB-NEXT:    addi a0, a0, -1921
-; RV64IZBB-NEXT:    srli a0, a0, 4
-; RV64IZBB-NEXT:    not a0, a0
+; RV64IZBB-NEXT:    lui a0, 74565
+; RV64IZBB-NEXT:    addiw a0, a0, 1656
+; RV64IZBB-NEXT:    slli a1, a0, 57
+; RV64IZBB-NEXT:    add a0, a0, a1
 ; RV64IZBB-NEXT:    ret
 ;
 ; RV64IZBS-LABEL: imm_2reg_1:
 ; RV64IZBS:       # %bb.0:
-; RV64IZBS-NEXT:    lui a0, 1048430
-; RV64IZBS-NEXT:    addiw a0, a0, 1493
-; RV64IZBS-NEXT:    slli a0, a0, 13
-; RV64IZBS-NEXT:    addi a0, a0, -1921
-; RV64IZBS-NEXT:    srli a0, a0, 4
-; RV64IZBS-NEXT:    not a0, a0
+; RV64IZBS-NEXT:    lui a0, 74565
+; RV64IZBS-NEXT:    addiw a0, a0, 1656
+; RV64IZBS-NEXT:    slli a1, a0, 57
+; RV64IZBS-NEXT:    add a0, a0, a1
 ; RV64IZBS-NEXT:    ret
 ;
 ; RV64IXTHEADBB-LABEL: imm_2reg_1:
 ; RV64IXTHEADBB:       # %bb.0:
-; RV64IXTHEADBB-NEXT:    lui a0, 1048430
-; RV64IXTHEADBB-NEXT:    addiw a0, a0, 1493
-; RV64IXTHEADBB-NEXT:    slli a0, a0, 13
-; RV64IXTHEADBB-NEXT:    addi a0, a0, -1921
-; RV64IXTHEADBB-NEXT:    srli a0, a0, 4
-; RV64IXTHEADBB-NEXT:    not a0, a0
+; RV64IXTHEADBB-NEXT:    lui a0, 74565
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, 1656
+; RV64IXTHEADBB-NEXT:    slli a1, a0, 57
+; RV64IXTHEADBB-NEXT:    add a0, a0, a1
 ; RV64IXTHEADBB-NEXT:    ret
   ret i64 -1152921504301427080 ; 0xF000_0000_1234_5678
 }
@@ -1703,40 +1687,28 @@ define i64 @imm_neg_9223372034778874949() {
 ; RV32I-NEXT:    lui a1, 524288
 ; RV32I-NEXT:    ret
 ;
-; RV64-NOPOOL-LABEL: imm_neg_9223372034778874949:
-; RV64-NOPOOL:       # %bb.0:
-; RV64-NOPOOL-NEXT:    lui a0, 1048329
-; RV64-NOPOOL-NEXT:    addiw a0, a0, -1911
-; RV64-NOPOOL-NEXT:    slli a0, a0, 12
-; RV64-NOPOOL-NEXT:    addi a0, a0, -1911
-; RV64-NOPOOL-NEXT:    srli a0, a0, 1
-; RV64-NOPOOL-NEXT:    not a0, a0
-; RV64-NOPOOL-NEXT:    ret
-;
-; RV64I-POOL-LABEL: imm_neg_9223372034778874949:
-; RV64I-POOL:       # %bb.0:
-; RV64I-POOL-NEXT:    lui a0, %hi(.LCPI38_0)
-; RV64I-POOL-NEXT:    ld a0, %lo(.LCPI38_0)(a0)
-; RV64I-POOL-NEXT:    ret
+; RV64I-LABEL: imm_neg_9223372034778874949:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 506812
+; RV64I-NEXT:    addiw a0, a0, -1093
+; RV64I-NEXT:    slli a1, a0, 63
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
 ;
 ; RV64IZBA-LABEL: imm_neg_9223372034778874949:
 ; RV64IZBA:       # %bb.0:
-; RV64IZBA-NEXT:    lui a0, 1048329
-; RV64IZBA-NEXT:    addiw a0, a0, -1911
-; RV64IZBA-NEXT:    slli a0, a0, 12
-; RV64IZBA-NEXT:    addi a0, a0, -1911
-; RV64IZBA-NEXT:    srli a0, a0, 1
-; RV64IZBA-NEXT:    not a0, a0
+; RV64IZBA-NEXT:    lui a0, 506812
+; RV64IZBA-NEXT:    addiw a0, a0, -1093
+; RV64IZBA-NEXT:    slli a1, a0, 63
+; RV64IZBA-NEXT:    add a0, a0, a1
 ; RV64IZBA-NEXT:    ret
 ;
 ; RV64IZBB-LABEL: imm_neg_9223372034778874949:
 ; RV64IZBB:       # %bb.0:
-; RV64IZBB-NEXT:    lui a0, 1048329
-; RV64IZBB-NEXT:    addiw a0, a0, -1911
-; RV64IZBB-NEXT:    slli a0, a0, 12
-; RV64IZBB-NEXT:    addi a0, a0, -1911
-; RV64IZBB-NEXT:    srli a0, a0, 1
-; RV64IZBB-NEXT:    not a0, a0
+; RV64IZBB-NEXT:    lui a0, 506812
+; RV64IZBB-NEXT:    addiw a0, a0, -1093
+; RV64IZBB-NEXT:    slli a1, a0, 63
+; RV64IZBB-NEXT:    add a0, a0, a1
 ; RV64IZBB-NEXT:    ret
 ;
 ; RV64IZBS-LABEL: imm_neg_9223372034778874949:
@@ -1748,12 +1720,10 @@ define i64 @imm_neg_9223372034778874949() {
 ;
 ; RV64IXTHEADBB-LABEL: imm_neg_9223372034778874949:
 ; RV64IXTHEADBB:       # %bb.0:
-; RV64IXTHEADBB-NEXT:    lui a0, 1048329
-; RV64IXTHEADBB-NEXT:    addiw a0, a0, -1911
-; RV64IXTHEADBB-NEXT:    slli a0, a0, 12
-; RV64IXTHEADBB-NEXT:    addi a0, a0, -1911
-; RV64IXTHEADBB-NEXT:    srli a0, a0, 1
-; RV64IXTHEADBB-NEXT:    not a0, a0
+; RV64IXTHEADBB-NEXT:    lui a0, 506812
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, -1093
+; RV64IXTHEADBB-NEXT:    slli a1, a0, 63
+; RV64IXTHEADBB-NEXT:    add a0, a0, a1
 ; RV64IXTHEADBB-NEXT:    ret
   ret i64 -9223372034778874949 ; 0x800000007bbbbbbb
 }
@@ -1932,29 +1902,26 @@ define i64 @imm_9223372034904144827() {
 ;
 ; RV64I-LABEL: imm_9223372034904144827:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a0, 1048343
-; RV64I-NEXT:    addiw a0, a0, 1911
-; RV64I-NEXT:    slli a0, a0, 12
-; RV64I-NEXT:    addi a0, a0, 1911
-; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    lui a0, 572348
+; RV64I-NEXT:    addiw a0, a0, -1093
+; RV64I-NEXT:    slli a1, a0, 63
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64IZBA-LABEL: imm_9223372034904144827:
 ; RV64IZBA:       # %bb.0:
-; RV64IZBA-NEXT:    lui a0, 1048343
-; RV64IZBA-NEXT:    addiw a0, a0, 1911
-; RV64IZBA-NEXT:    slli a0, a0, 12
-; RV64IZBA-NEXT:    addi a0, a0, 1911
-; RV64IZBA-NEXT:    srli a0, a0, 1
+; RV64IZBA-NEXT:    lui a0, 572348
+; RV64IZBA-NEXT:    addiw a0, a0, -1093
+; RV64IZBA-NEXT:    slli a1, a0, 63
+; RV64IZBA-NEXT:    add a0, a0, a1
 ; RV64IZBA-NEXT:    ret
 ;
 ; RV64IZBB-LABEL: imm_9223372034904144827:
 ; RV64IZBB:       # %bb.0:
-; RV64IZBB-NEXT:    lui a0, 1048343
-; RV64IZBB-NEXT:    addiw a0, a0, 1911
-; RV64IZBB-NEXT:    slli a0, a0, 12
-; RV64IZBB-NEXT:    addi a0, a0, 1911
-; RV64IZBB-NEXT:    srli a0, a0, 1
+; RV64IZBB-NEXT:    lui a0, 572348
+; RV64IZBB-NEXT:    addiw a0, a0, -1093
+; RV64IZBB-NEXT:    slli a1, a0, 63
+; RV64IZBB-NEXT:    add a0, a0, a1
 ; RV64IZBB-NEXT:    ret
 ;
 ; RV64IZBS-LABEL: imm_9223372034904144827:
@@ -1966,11 +1933,10 @@ define i64 @imm_9223372034904144827() {
 ;
 ; RV64IXTHEADBB-LABEL: imm_9223372034904144827:
 ; RV64IXTHEADBB:       # %bb.0:
-; RV64IXTHEADBB-NEXT:    lui a0, 1048343
-; RV64IXTHEADBB-NEXT:    addiw a0, a0, 1911
-; RV64IXTHEADBB-NEXT:    slli a0, a0, 12
-; RV64IXTHEADBB-NEXT:    addi a0, a0, 1911
-; RV64IXTHEADBB-NEXT:    srli a0, a0, 1
+; RV64IXTHEADBB-NEXT:    lui a0, 572348
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, -1093
+; RV64IXTHEADBB-NEXT:    slli a1, a0, 63
+; RV64IXTHEADBB-NEXT:    add a0, a0, a1
 ; RV64IXTHEADBB-NEXT:    ret
   ret i64 9223372034904144827 ; 0x7fffffff8bbbbbbb
 }
diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll
new file mode 100644
index 00000000000000..b2dea4237f5a5a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
+
+; Negative test to ensure we don't try to generate a vector reduce when
+; vector instructions are not available.
+
+define i32 @reduce_sum_4xi32(<4 x i32> %v) {
+; RV32-LABEL: reduce_sum_4xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 12(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a0, 8(a0)
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_sum_4xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 24(a0)
+; RV64-NEXT:    lw a2, 8(a0)
+; RV64-NEXT:    lw a3, 0(a0)
+; RV64-NEXT:    lw a0, 16(a0)
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a2, a0
+; RV64-NEXT:    ret
+  %e0 = extractelement <4 x i32> %v, i32 0
+  %e1 = extractelement <4 x i32> %v, i32 1
+  %e2 = extractelement <4 x i32> %v, i32 2
+  %e3 = extractelement <4 x i32> %v, i32 3
+  %add0 = add i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  ret i32 %add2
+}
+
+define i32 @reduce_xor_4xi32(<4 x i32> %v) {
+; RV32-LABEL: reduce_xor_4xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 12(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a0, 8(a0)
+; RV32-NEXT:    xor a2, a3, a2
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_xor_4xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a1, 24(a0)
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a0, 16(a0)
+; RV64-NEXT:    xor a2, a3, a2
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+  %e0 = extractelement <4 x i32> %v, i32 0
+  %e1 = extractelement <4 x i32> %v, i32 1
+  %e2 = extractelement <4 x i32> %v, i32 2
+  %e3 = extractelement <4 x i32> %v, i32 3
+  %xor0 = xor i32 %e0, %e1
+  %xor1 = xor i32 %xor0, %e2
+  %xor2 = xor i32 %xor1, %e3
+  ret i32 %xor2
+}
+
+define i32 @reduce_or_4xi32(<4 x i32> %v) {
+; RV32-LABEL: reduce_or_4xi32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 12(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a0, 8(a0)
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_or_4xi32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a1, 24(a0)
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a0, 16(a0)
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    ret
+  %e0 = extractelement <4 x i32> %v, i32 0
+  %e1 = extractelement <4 x i32> %v, i32 1
+  %e2 = extractelement <4 x i32> %v, i32 2
+  %e3 = extractelement <4 x i32> %v, i32 3
+  %or0 = or i32 %e0, %e1
+  %or1 = or i32 %or0, %e2
+  %or2 = or i32 %or1, %e3
+  ret i32 %or2
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
index e8d3ec1b128ecf..07a4c093f06e68 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
@@ -61,11 +61,10 @@ define i64 @orcb64_knownbits(i64 %a) nounwind {
 ; RV64ZBB-NEXT:    lui a1, 65535
 ; RV64ZBB-NEXT:    slli a1, a1, 12
 ; RV64ZBB-NEXT:    and a0, a0, a1
-; RV64ZBB-NEXT:    lui a1, 131073
-; RV64ZBB-NEXT:    slli a1, a1, 13
-; RV64ZBB-NEXT:    addi a1, a1, 1
-; RV64ZBB-NEXT:    slli a1, a1, 20
-; RV64ZBB-NEXT:    addi a1, a1, 8
+; RV64ZBB-NEXT:    lui a1, 256
+; RV64ZBB-NEXT:    addiw a1, a1, 8
+; RV64ZBB-NEXT:    slli a2, a1, 42
+; RV64ZBB-NEXT:    add a1, a1, a2
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    orc.b a0, a0
 ; RV64ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index 690ecc6eab33b3..884bb206a31ebe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
 
 define <vscale x 1 x i16> @bswap_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-LABEL: bswap_nxv1i16:
@@ -13,11 +13,11 @@ define <vscale x 1 x i16> @bswap_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 1 x i16> @llvm.bswap.nxv1i16(<vscale x 1 x i16> %va)
   ret <vscale x 1 x i16> %a
 }
@@ -32,11 +32,11 @@ define <vscale x 2 x i16> @bswap_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 2 x i16> @llvm.bswap.nxv2i16(<vscale x 2 x i16> %va)
   ret <vscale x 2 x i16> %a
 }
@@ -51,11 +51,11 @@ define <vscale x 4 x i16> @bswap_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 4 x i16> @llvm.bswap.nxv4i16(<vscale x 4 x i16> %va)
   ret <vscale x 4 x i16> %a
 }
@@ -70,11 +70,11 @@ define <vscale x 8 x i16> @bswap_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 8 x i16> @llvm.bswap.nxv8i16(<vscale x 8 x i16> %va)
   ret <vscale x 8 x i16> %a
 }
@@ -89,11 +89,11 @@ define <vscale x 16 x i16> @bswap_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 16 x i16> @llvm.bswap.nxv16i16(<vscale x 16 x i16> %va)
   ret <vscale x 16 x i16> %a
 }
@@ -108,11 +108,11 @@ define <vscale x 32 x i16> @bswap_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 32 x i16> @llvm.bswap.nxv32i16(<vscale x 32 x i16> %va)
   ret <vscale x 32 x i16> %a
 }
@@ -151,11 +151,11 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 1 x i32> @llvm.bswap.nxv1i32(<vscale x 1 x i32> %va)
   ret <vscale x 1 x i32> %a
 }
@@ -194,11 +194,11 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 2 x i32> @llvm.bswap.nxv2i32(<vscale x 2 x i32> %va)
   ret <vscale x 2 x i32> %a
 }
@@ -237,11 +237,11 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 4 x i32> @llvm.bswap.nxv4i32(<vscale x 4 x i32> %va)
   ret <vscale x 4 x i32> %a
 }
@@ -280,11 +280,11 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 8 x i32> @llvm.bswap.nxv8i32(<vscale x 8 x i32> %va)
   ret <vscale x 8 x i32> %a
 }
@@ -323,11 +323,11 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 16 x i32> @llvm.bswap.nxv16i32(<vscale x 16 x i32> %va)
   ret <vscale x 16 x i32> %a
 }
@@ -406,11 +406,11 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.bswap.nxv1i64(<vscale x 1 x i64> %va)
   ret <vscale x 1 x i64> %a
 }
@@ -489,11 +489,11 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.bswap.nxv2i64(<vscale x 2 x i64> %va)
   ret <vscale x 2 x i64> %a
 }
@@ -572,11 +572,11 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.bswap.nxv4i64(<vscale x 4 x i64> %va)
   ret <vscale x 4 x i64> %a
 }
@@ -669,11 +669,11 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: bswap_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: bswap_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.bswap.nxv8i64(<vscale x 8 x i64> %va)
   ret <vscale x 8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index be79222b5c5a7f..3c4ff5e7d1cefc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -3,10 +3,10 @@
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb,+m -target-abi=ilp32d \
-; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb,+m -target-abi=lp64d \
-; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb,+m -target-abi=ilp32d \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb,+m -target-abi=lp64d \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
 
 declare <vscale x 1 x i16> @llvm.vp.bswap.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i1>, i32)
 
@@ -19,11 +19,11 @@ define <vscale x 1 x i16> @vp_bswap_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 1 x i16> @llvm.vp.bswap.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i16> %v
 }
@@ -37,11 +37,11 @@ define <vscale x 1 x i16> @vp_bswap_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i16> @llvm.vp.bswap.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -59,11 +59,11 @@ define <vscale x 2 x i16> @vp_bswap_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.bswap.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i16> %v
 }
@@ -77,11 +77,11 @@ define <vscale x 2 x i16> @vp_bswap_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv2i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv2i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i16> @llvm.vp.bswap.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -99,11 +99,11 @@ define <vscale x 4 x i16> @vp_bswap_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 4 x i16> @llvm.vp.bswap.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i16> %v
 }
@@ -117,11 +117,11 @@ define <vscale x 4 x i16> @vp_bswap_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv4i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv4i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i16> @llvm.vp.bswap.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 %evl)
@@ -139,11 +139,11 @@ define <vscale x 8 x i16> @vp_bswap_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 8 x i16> @llvm.vp.bswap.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i16> %v
 }
@@ -157,11 +157,11 @@ define <vscale x 8 x i16> @vp_bswap_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv8i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv8i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i16> @llvm.vp.bswap.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 %evl)
@@ -179,11 +179,11 @@ define <vscale x 16 x i16> @vp_bswap_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 16 x i16> @llvm.vp.bswap.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i16> %v
 }
@@ -197,11 +197,11 @@ define <vscale x 16 x i16> @vp_bswap_nxv16i16_unmasked(<vscale x 16 x i16> %va,
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv16i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv16i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %v = call <vscale x 16 x i16> @llvm.vp.bswap.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 %evl)
@@ -219,11 +219,11 @@ define <vscale x 32 x i16> @vp_bswap_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.bswap.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
 }
@@ -237,11 +237,11 @@ define <vscale x 32 x i16> @vp_bswap_nxv32i16_unmasked(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv32i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv32i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x i16> @llvm.vp.bswap.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 %evl)
@@ -283,11 +283,11 @@ define <vscale x 1 x i32> @vp_bswap_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 1 x i32> @llvm.vp.bswap.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i32> %v
 }
@@ -325,11 +325,11 @@ define <vscale x 1 x i32> @vp_bswap_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i32_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i32_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i32> @llvm.vp.bswap.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -371,11 +371,11 @@ define <vscale x 2 x i32> @vp_bswap_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.bswap.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i32> %v
 }
@@ -413,11 +413,11 @@ define <vscale x 2 x i32> @vp_bswap_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv2i32_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv2i32_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i32> @llvm.vp.bswap.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -459,11 +459,11 @@ define <vscale x 4 x i32> @vp_bswap_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.vp.bswap.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i32> %v
 }
@@ -501,11 +501,11 @@ define <vscale x 4 x i32> @vp_bswap_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv4i32_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv4i32_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i32> @llvm.vp.bswap.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 %evl)
@@ -547,11 +547,11 @@ define <vscale x 8 x i32> @vp_bswap_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 8 x i32> @llvm.vp.bswap.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i32> %v
 }
@@ -589,11 +589,11 @@ define <vscale x 8 x i32> @vp_bswap_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv8i32_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv8i32_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i32> @llvm.vp.bswap.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 %evl)
@@ -635,11 +635,11 @@ define <vscale x 16 x i32> @vp_bswap_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vp.bswap.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i32> %v
 }
@@ -677,11 +677,11 @@ define <vscale x 16 x i32> @vp_bswap_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv16i32_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv16i32_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
   %v = call <vscale x 16 x i32> @llvm.vp.bswap.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 %evl)
@@ -765,11 +765,11 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 1 x i64> @llvm.vp.bswap.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i64> %v
 }
@@ -849,11 +849,11 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i64_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
   %v = call <vscale x 1 x i64> @llvm.vp.bswap.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 %evl)
@@ -937,11 +937,11 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.bswap.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x i64> %v
 }
@@ -1021,11 +1021,11 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv2i64_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
   %v = call <vscale x 2 x i64> @llvm.vp.bswap.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -1109,11 +1109,11 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 4 x i64> @llvm.vp.bswap.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x i64> %v
 }
@@ -1193,11 +1193,11 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv4i64_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
   %v = call <vscale x 4 x i64> @llvm.vp.bswap.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 %evl)
@@ -1345,11 +1345,11 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv7i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 7 x i64> @llvm.vp.bswap.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x i64> %v
 }
@@ -1442,11 +1442,11 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv7i64_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 7 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 7 x i1> %head, <vscale x 7 x i1> poison, <vscale x 7 x i32> zeroinitializer
   %v = call <vscale x 7 x i64> @llvm.vp.bswap.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl)
@@ -1594,11 +1594,11 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x i64> %v
 }
@@ -1691,11 +1691,11 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv8i64_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
   %v = call <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl)
@@ -1757,28 +1757,28 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv64i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    srli a2, a1, 1
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    slli a1, a1, 2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v16, v16, v0.t
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB32_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB32_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vmv1r.v v0, v24
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vmv1r.v v24, v0
+; CHECK-ZVKB-NEXT:    csrr a1, vlenb
+; CHECK-ZVKB-NEXT:    srli a2, a1, 1
+; CHECK-ZVKB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-ZVKB-NEXT:    slli a1, a1, 2
+; CHECK-ZVKB-NEXT:    sub a2, a0, a1
+; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVKB-NEXT:    addi a3, a3, -1
+; CHECK-ZVKB-NEXT:    and a2, a3, a2
+; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v16, v16, v0.t
+; CHECK-ZVKB-NEXT:    bltu a0, a1, .LBB32_2
+; CHECK-ZVKB-NEXT:  # %bb.1:
+; CHECK-ZVKB-NEXT:    mv a0, a1
+; CHECK-ZVKB-NEXT:  .LBB32_2:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vmv1r.v v0, v24
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i16> %v
 }
@@ -1806,23 +1806,23 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
 ; CHECK-NEXT:    vor.vv v8, v8, v24
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv64i16_unmasked:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    csrr a1, vlenb
-; CHECK-ZVBB-NEXT:    slli a1, a1, 2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
-; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v16, v16
-; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB33_2
-; CHECK-ZVBB-NEXT:  # %bb.1:
-; CHECK-ZVBB-NEXT:    mv a0, a1
-; CHECK-ZVBB-NEXT:  .LBB33_2:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16_unmasked:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    csrr a1, vlenb
+; CHECK-ZVKB-NEXT:    slli a1, a1, 2
+; CHECK-ZVKB-NEXT:    sub a2, a0, a1
+; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVKB-NEXT:    addi a3, a3, -1
+; CHECK-ZVKB-NEXT:    and a2, a3, a2
+; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v16, v16
+; CHECK-ZVKB-NEXT:    bltu a0, a1, .LBB33_2
+; CHECK-ZVKB-NEXT:  # %bb.1:
+; CHECK-ZVKB-NEXT:    mv a0, a1
+; CHECK-ZVKB-NEXT:  .LBB33_2:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
   %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
   %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl)
@@ -1908,12 +1908,12 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV64-NEXT:    vsrl.vi v8, v8, 16, v0.t
 ; RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vp_bswap_nxv1i48:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrev8.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT:    vsrl.vi v8, v8, 16, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vp_bswap_nxv1i48:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
+; CHECK-ZVKB-NEXT:    vsrl.vi v8, v8, 16, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %v = call <vscale x 1 x i48> @llvm.vp.bswap.nxv1i48(<vscale x 1 x i48> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i48> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 7fa1f4270525dc..4d78da2d647603 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -3,8 +3,8 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX2-RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX1-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX1-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVKB
 
 define void @bswap_v8i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: bswap_v8i16:
@@ -17,13 +17,13 @@ define void @bswap_v8i16(ptr %x, ptr %y) {
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: bswap_v8i16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vrev8.v v8, v8
-; ZVBB-NEXT:    vse16.v v8, (a0)
-; ZVBB-NEXT:    ret
+; ZVKB-LABEL: bswap_v8i16:
+; ZVKB:       # %bb.0:
+; ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVKB-NEXT:    vle16.v v8, (a0)
+; ZVKB-NEXT:    vrev8.v v8, v8
+; ZVKB-NEXT:    vse16.v v8, (a0)
+; ZVKB-NEXT:    ret
   %a = load <8 x i16>, ptr %x
   %b = load <8 x i16>, ptr %y
   %c = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a)
@@ -69,13 +69,13 @@ define void @bswap_v4i32(ptr %x, ptr %y) {
 ; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: bswap_v4i32:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vrev8.v v8, v8
-; ZVBB-NEXT:    vse32.v v8, (a0)
-; ZVBB-NEXT:    ret
+; ZVKB-LABEL: bswap_v4i32:
+; ZVKB:       # %bb.0:
+; ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVKB-NEXT:    vle32.v v8, (a0)
+; ZVKB-NEXT:    vrev8.v v8, v8
+; ZVKB-NEXT:    vse32.v v8, (a0)
+; ZVKB-NEXT:    ret
   %a = load <4 x i32>, ptr %x
   %b = load <4 x i32>, ptr %y
   %c = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a)
@@ -161,13 +161,13 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: bswap_v2i64:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-NEXT:    vle64.v v8, (a0)
-; ZVBB-NEXT:    vrev8.v v8, v8
-; ZVBB-NEXT:    vse64.v v8, (a0)
-; ZVBB-NEXT:    ret
+; ZVKB-LABEL: bswap_v2i64:
+; ZVKB:       # %bb.0:
+; ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-NEXT:    vle64.v v8, (a0)
+; ZVKB-NEXT:    vrev8.v v8, v8
+; ZVKB-NEXT:    vse64.v v8, (a0)
+; ZVKB-NEXT:    ret
   %a = load <2 x i64>, ptr %x
   %b = load <2 x i64>, ptr %y
   %c = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a)
@@ -229,13 +229,13 @@ define void @bswap_v16i16(ptr %x, ptr %y) {
 ; LMULMAX1-RV64-NEXT:    vse16.v v8, (a1)
 ; LMULMAX1-RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: bswap_v16i16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vrev8.v v8, v8
-; ZVBB-NEXT:    vse16.v v8, (a0)
-; ZVBB-NEXT:    ret
+; ZVKB-LABEL: bswap_v16i16:
+; ZVKB:       # %bb.0:
+; ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVKB-NEXT:    vle16.v v8, (a0)
+; ZVKB-NEXT:    vrev8.v v8, v8
+; ZVKB-NEXT:    vse16.v v8, (a0)
+; ZVKB-NEXT:    ret
   %a = load <16 x i16>, ptr %x
   %b = load <16 x i16>, ptr %y
   %c = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a)
@@ -341,13 +341,13 @@ define void @bswap_v8i32(ptr %x, ptr %y) {
 ; LMULMAX1-RV64-NEXT:    vse32.v v8, (a1)
 ; LMULMAX1-RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: bswap_v8i32:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vrev8.v v8, v8
-; ZVBB-NEXT:    vse32.v v8, (a0)
-; ZVBB-NEXT:    ret
+; ZVKB-LABEL: bswap_v8i32:
+; ZVKB:       # %bb.0:
+; ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; ZVKB-NEXT:    vle32.v v8, (a0)
+; ZVKB-NEXT:    vrev8.v v8, v8
+; ZVKB-NEXT:    vse32.v v8, (a0)
+; ZVKB-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %b = load <8 x i32>, ptr %y
   %c = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a)
@@ -557,13 +557,13 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; LMULMAX1-RV64-NEXT:    vse64.v v8, (a1)
 ; LMULMAX1-RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: bswap_v4i64:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVBB-NEXT:    vle64.v v8, (a0)
-; ZVBB-NEXT:    vrev8.v v8, v8
-; ZVBB-NEXT:    vse64.v v8, (a0)
-; ZVBB-NEXT:    ret
+; ZVKB-LABEL: bswap_v4i64:
+; ZVKB:       # %bb.0:
+; ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZVKB-NEXT:    vle64.v v8, (a0)
+; ZVKB-NEXT:    vrev8.v v8, v8
+; ZVKB-NEXT:    vse64.v v8, (a0)
+; ZVKB-NEXT:    ret
   %a = load <4 x i64>, ptr %x
   %b = load <4 x i64>, ptr %y
   %c = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 7593c1ab75ce42..d1ea56a1ff9381 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1083,3 +1083,132 @@ define <2 x float> @signbits() {
 entry:
   ret <2 x float> <float 0x36A0000000000000, float 0.000000e+00>
 }
+
+define <2 x half> @vid_v2f16() {
+; CHECK-LABEL: vid_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x half> <half 0.0, half 1.0>
+}
+
+define <2 x half> @vid_addend1_v2f16() {
+; CHECK-LABEL: vid_addend1_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x half> <half 1.0, half 2.0>
+}
+
+define <2 x half> @vid_denominator2_v2f16() {
+; CHECK-LABEL: vid_denominator2_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI27_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI27_0)
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    ret
+  ret <2 x half> <half 0.5, half 1.0>
+}
+
+define <2 x half> @vid_step2_v2f16() {
+; CHECK-LABEL: vid_step2_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x half> <half 0.0, half 2.0>
+}
+
+define <2 x float> @vid_v2f32() {
+; CHECK-LABEL: vid_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x float> <float 0.0, float 1.0>
+}
+
+define <2 x float> @vid_addend1_v2f32() {
+; CHECK-LABEL: vid_addend1_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x float> <float 1.0, float 2.0>
+}
+
+define <2 x float> @vid_denominator2_v2f32() {
+; CHECK-LABEL: vid_denominator2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI31_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI31_0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    ret
+  ret <2 x float> <float 0.5, float 1.0>
+}
+
+define <2 x float> @vid_step2_v2f32() {
+; CHECK-LABEL: vid_step2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x float> <float 0.0, float 2.0>
+}
+
+define <2 x double> @vid_v2f64() {
+; CHECK-LABEL: vid_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x double> <double 0.0, double 1.0>
+}
+
+define <2 x double> @vid_addend1_v2f64() {
+; CHECK-LABEL: vid_addend1_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x double> <double 1.0, double 2.0>
+}
+
+define <2 x double> @vid_denominator2_v2f64() {
+; CHECK-LABEL: vid_denominator2_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI35_0)
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    ret
+  ret <2 x double> <double 0.5, double 1.0>
+}
+
+define <2 x double> @vid_step2_v2f64() {
+; CHECK-LABEL: vid_step2_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
+; CHECK-NEXT:    ret
+  ret <2 x double> <double 0.0, double 2.0>
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index e95978744c408e..b648420aa2e03c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -259,7 +259,7 @@ define <4 x i8> @buildvec_vid_stepn3_add3_v4i8() {
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 3
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    li a0, -3
+; CHECK-NEXT:    li a0, 253
 ; CHECK-NEXT:    vmadd.vx v8, a0, v9
 ; CHECK-NEXT:    ret
   ret <4 x i8> <i8 3, i8 0, i8 -3, i8 -6>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index d22505eac04788..f3570495600f3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -5,15 +5,14 @@
 define i8 @explode_2xi8(<2 x i8> %v) {
 ; CHECK-LABEL: explode_2xi8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i8> %v, i32 0
   %e1 = extractelement <2 x i8> %v, i32 1
-  %add0 = add i8 %e0, %e1
+  %add0 = xor i8 %e0, %e1
   ret i8 %add0
 }
 
@@ -21,23 +20,23 @@ define i8 @explode_4xi8(<4 x i8> %v) {
 ; CHECK-LABEL: explode_4xi8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a2, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vmv.x.s a3, v8
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a2, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    xor a0, a0, a2
-; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i8> %v, i32 0
   %e1 = extractelement <4 x i8> %v, i32 1
   %e2 = extractelement <4 x i8> %v, i32 2
   %e3 = extractelement <4 x i8> %v, i32 3
-  %add0 = add i8 %e0, %e1
-  %add1 = xor i8 %add0, %e2
+  %add0 = xor i8 %e0, %e1
+  %add1 = add i8 %add0, %e2
   %add2 = add i8 %add1, %e3
   ret i8 %add2
 }
@@ -47,28 +46,28 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 ; CHECK-LABEL: explode_8xi8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a2, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vmv.x.s a3, v9
+; CHECK-NEXT:    vmv.x.s a1, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 4
-; CHECK-NEXT:    vmv.x.s a4, v9
+; CHECK-NEXT:    vmv.x.s a2, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 5
-; CHECK-NEXT:    vmv.x.s a5, v9
+; CHECK-NEXT:    vmv.x.s a3, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 6
-; CHECK-NEXT:    vmv.x.s a6, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 7
-; CHECK-NEXT:    vmv.x.s a7, v8
+; CHECK-NEXT:    vmv.x.s a4, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 7
+; CHECK-NEXT:    vmv.x.s a5, v9
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    xor a0, a0, a2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, a3, a5
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    add a6, a6, a7
-; CHECK-NEXT:    add a0, a0, a6
+; CHECK-NEXT:    add a0, a6, a0
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a5
 ; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i8> %v, i32 0
   %e1 = extractelement <8 x i8> %v, i32 1
@@ -78,8 +77,8 @@ define i8 @explode_8xi8(<8 x i8> %v) {
   %e5 = extractelement <8 x i8> %v, i32 5
   %e6 = extractelement <8 x i8> %v, i32 6
   %e7 = extractelement <8 x i8> %v, i32 7
-  %add0 = add i8 %e0, %e1
-  %add1 = xor i8 %add0, %e2
+  %add0 = xor i8 %e0, %e1
+  %add1 = add i8 %add0, %e2
   %add2 = add i8 %add1, %e3
   %add3 = add i8 %add2, %e4
   %add4 = add i8 %add3, %e5
@@ -89,119 +88,56 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 }
 
 define i8 @explode_16xi8(<16 x i8> %v) {
-; RV32-LABEL: explode_16xi8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 8
-; RV32-NEXT:    vmv.x.s t0, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 9
-; RV32-NEXT:    vmv.x.s t1, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 10
-; RV32-NEXT:    vmv.x.s t2, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 11
-; RV32-NEXT:    vmv.x.s t3, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 12
-; RV32-NEXT:    vmv.x.s t4, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 13
-; RV32-NEXT:    vmv.x.s t5, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 14
-; RV32-NEXT:    vmv.x.s t6, v9
-; RV32-NEXT:    vslidedown.vi v8, v8, 15
-; RV32-NEXT:    vmv.x.s s0, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a3, a3, a5
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, a6, t0
-; RV32-NEXT:    add a6, a6, t1
-; RV32-NEXT:    add a0, a0, a6
-; RV32-NEXT:    add t2, t2, t3
-; RV32-NEXT:    add t2, t2, t4
-; RV32-NEXT:    add t2, t2, t5
-; RV32-NEXT:    add t2, t2, t6
-; RV32-NEXT:    add a0, a0, t2
-; RV32-NEXT:    add a0, a0, s0
-; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: explode_16xi8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset s0, -8
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 8
-; RV64-NEXT:    vmv.x.s t0, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 9
-; RV64-NEXT:    vmv.x.s t1, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 10
-; RV64-NEXT:    vmv.x.s t2, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 11
-; RV64-NEXT:    vmv.x.s t3, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 12
-; RV64-NEXT:    vmv.x.s t4, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 13
-; RV64-NEXT:    vmv.x.s t5, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 14
-; RV64-NEXT:    vmv.x.s t6, v9
-; RV64-NEXT:    vslidedown.vi v8, v8, 15
-; RV64-NEXT:    vmv.x.s s0, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, a6, t0
-; RV64-NEXT:    add a6, a6, t1
-; RV64-NEXT:    add a0, a0, a6
-; RV64-NEXT:    add t2, t2, t3
-; RV64-NEXT:    add t2, t2, t4
-; RV64-NEXT:    add t2, t2, t5
-; RV64-NEXT:    add t2, t2, t6
-; RV64-NEXT:    add a0, a0, t2
-; RV64-NEXT:    add a0, a0, s0
-; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    ret
+; CHECK-LABEL: explode_16xi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 4
+; CHECK-NEXT:    vmv.x.s a2, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 5
+; CHECK-NEXT:    vmv.x.s a3, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 6
+; CHECK-NEXT:    vmv.x.s a4, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 7
+; CHECK-NEXT:    vmv.x.s a5, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 8
+; CHECK-NEXT:    vmv.x.s a6, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 9
+; CHECK-NEXT:    vmv.x.s a7, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 10
+; CHECK-NEXT:    vmv.x.s t0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 11
+; CHECK-NEXT:    vmv.x.s t1, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 12
+; CHECK-NEXT:    vmv.x.s t2, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 13
+; CHECK-NEXT:    vmv.x.s t3, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 14
+; CHECK-NEXT:    vmv.x.s t4, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 15
+; CHECK-NEXT:    vmv.x.s t5, v9
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s t6, v8
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add a0, t6, a0
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a5, a5, a6
+; CHECK-NEXT:    add a5, a5, a7
+; CHECK-NEXT:    add a5, a5, t0
+; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add t1, t1, t2
+; CHECK-NEXT:    add t1, t1, t3
+; CHECK-NEXT:    add t1, t1, t4
+; CHECK-NEXT:    add t1, t1, t5
+; CHECK-NEXT:    add a0, a0, t1
+; CHECK-NEXT:    ret
   %e0 = extractelement <16 x i8> %v, i32 0
   %e1 = extractelement <16 x i8> %v, i32 1
   %e2 = extractelement <16 x i8> %v, i32 2
@@ -218,8 +154,8 @@ define i8 @explode_16xi8(<16 x i8> %v) {
   %e13 = extractelement <16 x i8> %v, i32 13
   %e14 = extractelement <16 x i8> %v, i32 14
   %e15 = extractelement <16 x i8> %v, i32 15
-  %add0 = add i8 %e0, %e1
-  %add1 = xor i8 %add0, %e2
+  %add0 = xor i8 %e0, %e1
+  %add1 = add i8 %add0, %e2
   %add2 = add i8 %add1, %e3
   %add3 = add i8 %add2, %e4
   %add4 = add i8 %add3, %e5
@@ -239,15 +175,14 @@ define i8 @explode_16xi8(<16 x i8> %v) {
 define i16 @explode_2xi16(<2 x i16> %v) {
 ; CHECK-LABEL: explode_2xi16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v8
-; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i16> %v, i32 0
   %e1 = extractelement <2 x i16> %v, i32 1
-  %add0 = add i16 %e0, %e1
+  %add0 = xor i16 %e0, %e1
   ret i16 %add0
 }
 
@@ -255,23 +190,23 @@ define i16 @explode_4xi16(<4 x i16> %v) {
 ; CHECK-LABEL: explode_4xi16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a2, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vmv.x.s a3, v8
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a2, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    xor a0, a0, a2
-; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i16> %v, i32 0
   %e1 = extractelement <4 x i16> %v, i32 1
   %e2 = extractelement <4 x i16> %v, i32 2
   %e3 = extractelement <4 x i16> %v, i32 3
-  %add0 = add i16 %e0, %e1
-  %add1 = xor i16 %add0, %e2
+  %add0 = xor i16 %e0, %e1
+  %add1 = add i16 %add0, %e2
   %add2 = add i16 %add1, %e3
   ret i16 %add2
 }
@@ -281,28 +216,28 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 ; CHECK-LABEL: explode_8xi16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vmv.x.s a1, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a2, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vmv.x.s a3, v9
+; CHECK-NEXT:    vmv.x.s a1, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 4
-; CHECK-NEXT:    vmv.x.s a4, v9
+; CHECK-NEXT:    vmv.x.s a2, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 5
-; CHECK-NEXT:    vmv.x.s a5, v9
+; CHECK-NEXT:    vmv.x.s a3, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 6
-; CHECK-NEXT:    vmv.x.s a6, v9
-; CHECK-NEXT:    vslidedown.vi v8, v8, 7
-; CHECK-NEXT:    vmv.x.s a7, v8
+; CHECK-NEXT:    vmv.x.s a4, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 7
+; CHECK-NEXT:    vmv.x.s a5, v9
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    xor a0, a0, a2
-; CHECK-NEXT:    add a3, a3, a4
-; CHECK-NEXT:    add a3, a3, a5
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    add a6, a6, a7
-; CHECK-NEXT:    add a0, a0, a6
+; CHECK-NEXT:    add a0, a6, a0
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a5
 ; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i16> %v, i32 0
   %e1 = extractelement <8 x i16> %v, i32 1
@@ -312,8 +247,8 @@ define i16 @explode_8xi16(<8 x i16> %v) {
   %e5 = extractelement <8 x i16> %v, i32 5
   %e6 = extractelement <8 x i16> %v, i32 6
   %e7 = extractelement <8 x i16> %v, i32 7
-  %add0 = add i16 %e0, %e1
-  %add1 = xor i16 %add0, %e2
+  %add0 = xor i16 %e0, %e1
+  %add1 = add i16 %add0, %e2
   %add2 = add i16 %add1, %e3
   %add3 = add i16 %add2, %e4
   %add4 = add i16 %add3, %e5
@@ -323,121 +258,57 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 }
 
 define i16 @explode_16xi16(<16 x i16> %v) {
-; RV32-LABEL: explode_16xi16:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v10
-; RV32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 8
-; RV32-NEXT:    vmv.x.s t0, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 9
-; RV32-NEXT:    vmv.x.s t1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 10
-; RV32-NEXT:    vmv.x.s t2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 11
-; RV32-NEXT:    vmv.x.s t3, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 12
-; RV32-NEXT:    vmv.x.s t4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 13
-; RV32-NEXT:    vmv.x.s t5, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 14
-; RV32-NEXT:    vmv.x.s t6, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 15
-; RV32-NEXT:    vmv.x.s s0, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a3, a3, a5
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, a6, t0
-; RV32-NEXT:    add a6, a6, t1
-; RV32-NEXT:    add a0, a0, a6
-; RV32-NEXT:    add t2, t2, t3
-; RV32-NEXT:    add t2, t2, t4
-; RV32-NEXT:    add t2, t2, t5
-; RV32-NEXT:    add t2, t2, t6
-; RV32-NEXT:    add a0, a0, t2
-; RV32-NEXT:    add a0, a0, s0
-; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: explode_16xi16:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset s0, -8
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v10
-; RV64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 8
-; RV64-NEXT:    vmv.x.s t0, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 9
-; RV64-NEXT:    vmv.x.s t1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 10
-; RV64-NEXT:    vmv.x.s t2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 11
-; RV64-NEXT:    vmv.x.s t3, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 12
-; RV64-NEXT:    vmv.x.s t4, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 13
-; RV64-NEXT:    vmv.x.s t5, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 14
-; RV64-NEXT:    vmv.x.s t6, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 15
-; RV64-NEXT:    vmv.x.s s0, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, a6, t0
-; RV64-NEXT:    add a6, a6, t1
-; RV64-NEXT:    add a0, a0, a6
-; RV64-NEXT:    add t2, t2, t3
-; RV64-NEXT:    add t2, t2, t4
-; RV64-NEXT:    add t2, t2, t5
-; RV64-NEXT:    add t2, t2, t6
-; RV64-NEXT:    add a0, a0, t2
-; RV64-NEXT:    add a0, a0, s0
-; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    ret
+; CHECK-LABEL: explode_16xi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vmv.x.s a2, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vmv.x.s a3, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 6
+; CHECK-NEXT:    vmv.x.s a4, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vmv.x.s a5, v10
+; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vmv.x.s a6, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 9
+; CHECK-NEXT:    vmv.x.s a7, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 10
+; CHECK-NEXT:    vmv.x.s t0, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 11
+; CHECK-NEXT:    vmv.x.s t1, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 12
+; CHECK-NEXT:    vmv.x.s t2, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 13
+; CHECK-NEXT:    vmv.x.s t3, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 14
+; CHECK-NEXT:    vmv.x.s t4, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 15
+; CHECK-NEXT:    vmv.x.s t5, v10
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s t6, v8
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add a0, t6, a0
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a5, a5, a6
+; CHECK-NEXT:    add a5, a5, a7
+; CHECK-NEXT:    add a5, a5, t0
+; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add t1, t1, t2
+; CHECK-NEXT:    add t1, t1, t3
+; CHECK-NEXT:    add t1, t1, t4
+; CHECK-NEXT:    add t1, t1, t5
+; CHECK-NEXT:    add a0, a0, t1
+; CHECK-NEXT:    ret
   %e0 = extractelement <16 x i16> %v, i32 0
   %e1 = extractelement <16 x i16> %v, i32 1
   %e2 = extractelement <16 x i16> %v, i32 2
@@ -454,8 +325,8 @@ define i16 @explode_16xi16(<16 x i16> %v) {
   %e13 = extractelement <16 x i16> %v, i32 13
   %e14 = extractelement <16 x i16> %v, i32 14
   %e15 = extractelement <16 x i16> %v, i32 15
-  %add0 = add i16 %e0, %e1
-  %add1 = xor i16 %add0, %e2
+  %add0 = xor i16 %e0, %e1
+  %add1 = add i16 %add0, %e2
   %add2 = add i16 %add1, %e3
   %add3 = add i16 %add2, %e4
   %add4 = add i16 %add3, %e5
@@ -473,26 +344,16 @@ define i16 @explode_16xi16(<16 x i16> %v) {
 }
 
 define i32 @explode_2xi32(<2 x i32> %v) {
-; RV32-LABEL: explode_2xi32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: explode_2xi32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v8
-; RV64-NEXT:    addw a0, a0, a1
-; RV64-NEXT:    ret
+; CHECK-LABEL: explode_2xi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i32> %v, i32 0
   %e1 = extractelement <2 x i32> %v, i32 1
-  %add0 = add i32 %e0, %e1
+  %add0 = xor i32 %e0, %e1
   ret i32 %add0
 }
 
@@ -500,38 +361,38 @@ define i32 @explode_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: explode_4xi32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 3
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vmv.x.s a2, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a0, a2, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v9
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s a2, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    addw a0, a0, a3
+; RV64-NEXT:    addw a0, a2, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
   %e2 = extractelement <4 x i32> %v, i32 2
   %e3 = extractelement <4 x i32> %v, i32 3
-  %add0 = add i32 %e0, %e1
-  %add1 = xor i32 %add0, %e2
+  %add0 = xor i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
   %add2 = add i32 %add1, %e3
   ret i32 %add2
 }
@@ -541,57 +402,57 @@ define i32 @explode_8xi32(<8 x i32> %v) {
 ; RV32-LABEL: explode_8xi32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vmv.x.s a2, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a3, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v8
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 7
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vmv.x.s a6, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a3, a3, a5
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a0, a0, a6
+; RV32-NEXT:    add a0, a6, a0
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a0, a0, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vmv.x.s a2, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vmv.x.s a3, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v8
+; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s a6, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    addw a0, a0, a6
+; RV64-NEXT:    add a0, a6, a0
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    addw a0, a0, a5
 ; RV64-NEXT:    ret
   %e0 = extractelement <8 x i32> %v, i32 0
   %e1 = extractelement <8 x i32> %v, i32 1
@@ -601,8 +462,8 @@ define i32 @explode_8xi32(<8 x i32> %v) {
   %e5 = extractelement <8 x i32> %v, i32 5
   %e6 = extractelement <8 x i32> %v, i32 6
   %e7 = extractelement <8 x i32> %v, i32 7
-  %add0 = add i32 %e0, %e1
-  %add1 = xor i32 %add0, %e2
+  %add0 = xor i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
   %add2 = add i32 %add1, %e3
   %add3 = add i32 %add2, %e4
   %add4 = add i32 %add3, %e5
@@ -618,60 +479,57 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 128
 ; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 116(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s2, -12
 ; RV32-NEXT:    addi s0, sp, 128
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vmv.x.s a2, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vmv.x.s a3, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v12
+; RV32-NEXT:    vmv.x.s a4, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v12
-; RV32-NEXT:    mv t0, sp
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    mv a6, sp
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (t0)
-; RV32-NEXT:    lw t0, 32(sp)
-; RV32-NEXT:    lw t1, 36(sp)
-; RV32-NEXT:    lw t2, 40(sp)
-; RV32-NEXT:    lw t3, 44(sp)
-; RV32-NEXT:    lw t4, 48(sp)
-; RV32-NEXT:    lw t5, 52(sp)
-; RV32-NEXT:    lw t6, 56(sp)
-; RV32-NEXT:    lw s2, 60(sp)
+; RV32-NEXT:    vse32.v v8, (a6)
+; RV32-NEXT:    lw a6, 32(sp)
+; RV32-NEXT:    lw a7, 36(sp)
+; RV32-NEXT:    lw t0, 40(sp)
+; RV32-NEXT:    lw t1, 44(sp)
+; RV32-NEXT:    lw t2, 48(sp)
+; RV32-NEXT:    lw t3, 52(sp)
+; RV32-NEXT:    lw t4, 56(sp)
+; RV32-NEXT:    lw t5, 60(sp)
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vmv.x.s t6, v8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a3, a3, a5
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    add a6, a6, t0
-; RV32-NEXT:    add a0, a0, a6
-; RV32-NEXT:    add t1, t1, t2
-; RV32-NEXT:    add t1, t1, t3
-; RV32-NEXT:    add a0, a0, t1
-; RV32-NEXT:    add t4, t4, t5
-; RV32-NEXT:    add t4, t4, t6
-; RV32-NEXT:    add t4, t4, s2
-; RV32-NEXT:    add a0, a0, t4
+; RV32-NEXT:    add a0, t6, a0
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a7, a7, t1
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add t2, t2, t4
+; RV32-NEXT:    add t2, t2, t5
+; RV32-NEXT:    add a0, a0, t2
 ; RV32-NEXT:    addi sp, s0, -128
 ; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 116(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 128
 ; RV32-NEXT:    ret
 ;
@@ -681,60 +539,57 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    .cfi_def_cfa_offset 128
 ; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 104(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s2, -24
 ; RV64-NEXT:    addi s0, sp, 128
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    vmv.x.s a3, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v12
+; RV64-NEXT:    vmv.x.s a4, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v12
-; RV64-NEXT:    mv t0, sp
+; RV64-NEXT:    vmv.x.s a5, v12
+; RV64-NEXT:    mv a6, sp
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (t0)
-; RV64-NEXT:    lw t0, 32(sp)
-; RV64-NEXT:    lw t1, 36(sp)
-; RV64-NEXT:    lw t2, 40(sp)
-; RV64-NEXT:    lw t3, 44(sp)
-; RV64-NEXT:    lw t4, 48(sp)
-; RV64-NEXT:    lw t5, 52(sp)
-; RV64-NEXT:    lw t6, 56(sp)
-; RV64-NEXT:    lw s2, 60(sp)
+; RV64-NEXT:    vse32.v v8, (a6)
+; RV64-NEXT:    lw a6, 32(sp)
+; RV64-NEXT:    lw a7, 36(sp)
+; RV64-NEXT:    lw t0, 40(sp)
+; RV64-NEXT:    lw t1, 44(sp)
+; RV64-NEXT:    lw t2, 48(sp)
+; RV64-NEXT:    lw t3, 52(sp)
+; RV64-NEXT:    lw t4, 56(sp)
+; RV64-NEXT:    lw t5, 60(sp)
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s t6, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a3, a3, a5
-; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, a6, t0
-; RV64-NEXT:    add a0, a0, a6
-; RV64-NEXT:    add t1, t1, t2
-; RV64-NEXT:    add t1, t1, t3
-; RV64-NEXT:    add a0, a0, t1
-; RV64-NEXT:    add t4, t4, t5
-; RV64-NEXT:    add t4, t4, t6
-; RV64-NEXT:    add t4, t4, s2
-; RV64-NEXT:    addw a0, a0, t4
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t2, t2, t3
+; RV64-NEXT:    add t2, t2, t4
+; RV64-NEXT:    add t2, t2, t5
+; RV64-NEXT:    addw a0, a0, t2
 ; RV64-NEXT:    addi sp, s0, -128
 ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 104(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 128
 ; RV64-NEXT:    ret
   %e0 = extractelement <16 x i32> %v, i32 0
@@ -753,8 +608,8 @@ define i32 @explode_16xi32(<16 x i32> %v) {
   %e13 = extractelement <16 x i32> %v, i32 13
   %e14 = extractelement <16 x i32> %v, i32 14
   %e15 = extractelement <16 x i32> %v, i32 15
-  %add0 = add i32 %e0, %e1
-  %add1 = xor i32 %add0, %e2
+  %add0 = xor i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
   %add2 = add i32 %add1, %e3
   %add3 = add i32 %add2, %e4
   %add4 = add i32 %add3, %e5
@@ -774,88 +629,79 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 define i64 @explode_2xi64(<2 x i64> %v) {
 ; RV32-LABEL: explode_2xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    vmv.x.s a3, v8
-; RV32-NEXT:    add a1, a1, a0
-; RV32-NEXT:    add a0, a2, a3
-; RV32-NEXT:    sltu a2, a0, a2
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_2xi64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v8
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    ret
   %e0 = extractelement <2 x i64> %v, i32 0
   %e1 = extractelement <2 x i64> %v, i32 1
-  %add0 = add i64 %e0, %e1
+  %add0 = xor i64 %e0, %e1
   ret i64 %add0
 }
 
 define i64 @explode_4xi64(<4 x i64> %v) {
 ; RV32-LABEL: explode_4xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsrl.vx v12, v10, a0
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32-NEXT:    vsrl.vx v12, v10, a0
 ; RV32-NEXT:    vmv.x.s a3, v12
 ; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vsrl.vx v12, v10, a0
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    vmv.x.s a6, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    vmv.x.s a7, v10
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    add a4, a2, a4
-; RV32-NEXT:    sltu a2, a4, a2
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    xor a1, a1, a5
-; RV32-NEXT:    xor a2, a4, a6
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vmv.x.s a5, v8
+; RV32-NEXT:    add a2, a5, a2
+; RV32-NEXT:    sltu a5, a2, a5
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a1, a0, a3
+; RV32-NEXT:    add a0, a2, a4
 ; RV32-NEXT:    sltu a2, a0, a2
-; RV32-NEXT:    add a1, a1, a7
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s a2, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a0, a2, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i64> %v, i32 0
   %e1 = extractelement <4 x i64> %v, i32 1
   %e2 = extractelement <4 x i64> %v, i32 2
   %e3 = extractelement <4 x i64> %v, i32 3
-  %add0 = add i64 %e0, %e1
-  %add1 = xor i64 %add0, %e2
+  %add0 = xor i64 %e0, %e1
+  %add1 = add i64 %add0, %e2
   %add2 = add i64 %add1, %e3
   ret i64 %add2
 }
@@ -864,71 +710,63 @@ define i64 @explode_4xi64(<4 x i64> %v) {
 define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-LABEL: explode_8xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsrl.vx v16, v12, a0
+; RV32-NEXT:    vmv.x.s a1, v16
+; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s a3, v16
 ; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s a5, v16
 ; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s a7, v16
 ; RV32-NEXT:    vmv.x.s t0, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s t1, v16
 ; RV32-NEXT:    vmv.x.s t2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vslidedown.vi v12, v8, 7
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s t3, v16
 ; RV32-NEXT:    vmv.x.s t4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s t5, v16
-; RV32-NEXT:    vmv.x.s t6, v12
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    vmv.x.s s0, v8
-; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vmv.x.s t5, v8
+; RV32-NEXT:    add a2, t5, a2
+; RV32-NEXT:    sltu t5, a2, t5
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, t5
+; RV32-NEXT:    add a0, a0, a3
 ; RV32-NEXT:    add a4, a2, a4
-; RV32-NEXT:    sltu a2, a4, a2
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    xor a1, a1, a5
-; RV32-NEXT:    xor a2, a4, a6
-; RV32-NEXT:    add t0, a2, t0
-; RV32-NEXT:    sltu a2, t0, a2
+; RV32-NEXT:    sltu a1, a4, a2
+; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a6, a4, a6
+; RV32-NEXT:    sltu a1, a6, a4
 ; RV32-NEXT:    add a1, a1, a7
-; RV32-NEXT:    add a2, a2, t1
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add t0, a6, t0
+; RV32-NEXT:    sltu a1, t0, a6
+; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t2, t0, t2
-; RV32-NEXT:    sltu a2, t2, t0
-; RV32-NEXT:    add a2, a2, t3
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add t4, t2, t4
-; RV32-NEXT:    sltu a2, t4, t2
-; RV32-NEXT:    add a2, a2, t5
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add t6, t4, t6
-; RV32-NEXT:    sltu a2, t6, t4
-; RV32-NEXT:    add a0, a2, a0
-; RV32-NEXT:    add a1, a1, a0
-; RV32-NEXT:    add a0, t6, s0
-; RV32-NEXT:    sltu a2, a0, t6
+; RV32-NEXT:    sltu a1, t2, t0
+; RV32-NEXT:    add a1, a1, t3
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    add a0, t2, t4
+; RV32-NEXT:    sltu a2, a0, t2
 ; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi64:
@@ -942,29 +780,28 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV64-NEXT:    addi s0, sp, 128
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
-; RV64-NEXT:    mv a4, sp
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    mv a2, sp
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a4)
-; RV64-NEXT:    ld a4, 32(sp)
-; RV64-NEXT:    ld a5, 40(sp)
-; RV64-NEXT:    ld a6, 48(sp)
-; RV64-NEXT:    ld a7, 56(sp)
+; RV64-NEXT:    vse64.v v8, (a2)
+; RV64-NEXT:    ld a2, 32(sp)
+; RV64-NEXT:    ld a3, 40(sp)
+; RV64-NEXT:    ld a4, 48(sp)
+; RV64-NEXT:    ld a5, 56(sp)
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s a6, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a0, a6, a0
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a0, a0, a6
+; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    addi sp, s0, -128
 ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
@@ -978,8 +815,8 @@ define i64 @explode_8xi64(<8 x i64> %v) {
   %e5 = extractelement <8 x i64> %v, i32 5
   %e6 = extractelement <8 x i64> %v, i32 6
   %e7 = extractelement <8 x i64> %v, i32 7
-  %add0 = add i64 %e0, %e1
-  %add1 = xor i64 %add0, %e2
+  %add0 = xor i64 %e0, %e1
+  %add1 = add i64 %add0, %e2
   %add2 = add i64 %add1, %e3
   %add3 = add i64 %add2, %e4
   %add4 = add i64 %add3, %e5
@@ -1019,130 +856,130 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_offset s9, -44
 ; RV32-NEXT:    .cfi_offset s10, -48
 ; RV32-NEXT:    .cfi_offset s11, -52
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vmv.x.s a0, v16
-; RV32-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v16, v8, 1
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s a3, v24
-; RV32-NEXT:    vmv.x.s a4, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 2
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s a5, v24
-; RV32-NEXT:    vmv.x.s a6, v16
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vmv.x.s a1, v16
+; RV32-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    vslidedown.vi v16, v8, 3
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s a7, v24
-; RV32-NEXT:    vmv.x.s t0, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    vmv.x.s a2, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s3, v24
-; RV32-NEXT:    vmv.x.s t1, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s0, v24
+; RV32-NEXT:    vmv.x.s a3, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 5
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s4, v24
-; RV32-NEXT:    vmv.x.s t2, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s1, v24
+; RV32-NEXT:    vmv.x.s a4, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 6
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s5, v24
-; RV32-NEXT:    vmv.x.s t3, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s2, v24
+; RV32-NEXT:    vmv.x.s a5, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 7
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s6, v24
-; RV32-NEXT:    vmv.x.s t4, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s3, v24
+; RV32-NEXT:    vmv.x.s a6, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s7, v24
-; RV32-NEXT:    vmv.x.s t5, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s4, v24
+; RV32-NEXT:    vmv.x.s a7, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 9
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s8, v24
-; RV32-NEXT:    vmv.x.s t6, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s5, v24
+; RV32-NEXT:    vmv.x.s t0, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 10
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s9, v24
-; RV32-NEXT:    vmv.x.s s0, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s6, v24
+; RV32-NEXT:    vmv.x.s t1, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 11
-; RV32-NEXT:    vsrl.vx v24, v16, a1
-; RV32-NEXT:    vmv.x.s s10, v24
-; RV32-NEXT:    vmv.x.s s1, v16
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s7, v24
+; RV32-NEXT:    vmv.x.s t2, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 12
-; RV32-NEXT:    vsrl.vx v24, v16, a1
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s8, v24
+; RV32-NEXT:    vmv.x.s t3, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 13
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s9, v24
+; RV32-NEXT:    vmv.x.s t4, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 14
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s s10, v24
+; RV32-NEXT:    vmv.x.s t5, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 15
+; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s11, v24
-; RV32-NEXT:    vmv.x.s s2, v16
-; RV32-NEXT:    vslidedown.vi v24, v8, 13
-; RV32-NEXT:    vsrl.vx v16, v24, a1
+; RV32-NEXT:    vmv.s.x v9, zero
 ; RV32-NEXT:    vmv.x.s ra, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 14
-; RV32-NEXT:    vsrl.vx v0, v16, a1
-; RV32-NEXT:    vslidedown.vi v8, v8, 15
-; RV32-NEXT:    vmv.x.s a2, v24
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add a3, a1, a3
-; RV32-NEXT:    add a4, a0, a4
-; RV32-NEXT:    sltu a0, a4, a0
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    xor a0, a0, a5
-; RV32-NEXT:    xor a1, a4, a6
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t0, a1, t0
-; RV32-NEXT:    sltu a1, t0, a1
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    add a1, a0, t6
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    lw t6, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add t6, a0, t6
+; RV32-NEXT:    sltu a0, t6, a0
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a2, t6, a2
+; RV32-NEXT:    sltu a1, a2, t6
+; RV32-NEXT:    add a1, a1, s0
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a3, a2, a3
+; RV32-NEXT:    sltu a1, a3, a2
+; RV32-NEXT:    add a1, a1, s1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a4, a3, a4
+; RV32-NEXT:    sltu a1, a4, a3
+; RV32-NEXT:    add a1, a1, s2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a5, a4, a5
+; RV32-NEXT:    sltu a1, a5, a4
 ; RV32-NEXT:    add a1, a1, s3
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t1, t0, t1
-; RV32-NEXT:    sltu a1, t1, t0
+; RV32-NEXT:    add a6, a5, a6
+; RV32-NEXT:    sltu a1, a6, a5
 ; RV32-NEXT:    add a1, a1, s4
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t2, t1, t2
-; RV32-NEXT:    sltu a1, t2, t1
+; RV32-NEXT:    add a7, a6, a7
+; RV32-NEXT:    sltu a1, a7, a6
 ; RV32-NEXT:    add a1, a1, s5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t3, t2, t3
-; RV32-NEXT:    sltu a1, t3, t2
+; RV32-NEXT:    add t0, a7, t0
+; RV32-NEXT:    sltu a1, t0, a7
 ; RV32-NEXT:    add a1, a1, s6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t4, t3, t4
-; RV32-NEXT:    sltu a1, t4, t3
+; RV32-NEXT:    add t1, t0, t1
+; RV32-NEXT:    sltu a1, t1, t0
 ; RV32-NEXT:    add a1, a1, s7
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t5, t4, t5
-; RV32-NEXT:    sltu a1, t5, t4
+; RV32-NEXT:    add t2, t1, t2
+; RV32-NEXT:    sltu a1, t2, t1
 ; RV32-NEXT:    add a1, a1, s8
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t6, t5, t6
-; RV32-NEXT:    sltu a1, t6, t5
+; RV32-NEXT:    add t3, t2, t3
+; RV32-NEXT:    sltu a1, t3, t2
 ; RV32-NEXT:    add a1, a1, s9
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s0, t6, s0
-; RV32-NEXT:    sltu a1, s0, t6
+; RV32-NEXT:    add t4, t3, t4
+; RV32-NEXT:    sltu a1, t4, t3
 ; RV32-NEXT:    add a1, a1, s10
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s1, s0, s1
-; RV32-NEXT:    sltu a1, s1, s0
+; RV32-NEXT:    add t5, t4, t5
+; RV32-NEXT:    sltu a1, t5, t4
 ; RV32-NEXT:    add a1, a1, s11
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s2, s1, s2
-; RV32-NEXT:    sltu a1, s2, s1
-; RV32-NEXT:    add a1, a1, ra
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    add a2, s2, a2
-; RV32-NEXT:    sltu a3, a2, s2
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    vmv.x.s a3, v16
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    add a3, a2, a3
-; RV32-NEXT:    sltu a2, a3, a2
-; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    add a0, t5, ra
+; RV32-NEXT:    sltu a2, a0, t5
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
@@ -1166,56 +1003,52 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    .cfi_def_cfa_offset 256
 ; RV64-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 232(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s2, -24
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v16, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v16
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v16
+; RV64-NEXT:    vmv.x.s a0, v16
 ; RV64-NEXT:    vslidedown.vi v16, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v16
-; RV64-NEXT:    mv a4, sp
+; RV64-NEXT:    vmv.x.s a1, v16
+; RV64-NEXT:    mv a2, sp
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vse64.v v8, (a4)
-; RV64-NEXT:    ld a4, 32(sp)
-; RV64-NEXT:    ld a5, 40(sp)
-; RV64-NEXT:    ld a6, 48(sp)
-; RV64-NEXT:    ld a7, 56(sp)
-; RV64-NEXT:    ld t0, 64(sp)
-; RV64-NEXT:    ld t1, 72(sp)
-; RV64-NEXT:    ld t2, 80(sp)
-; RV64-NEXT:    ld t3, 88(sp)
-; RV64-NEXT:    ld t4, 96(sp)
-; RV64-NEXT:    ld t5, 104(sp)
-; RV64-NEXT:    ld t6, 112(sp)
-; RV64-NEXT:    ld s2, 120(sp)
+; RV64-NEXT:    vse64.v v8, (a2)
+; RV64-NEXT:    ld a2, 32(sp)
+; RV64-NEXT:    ld a3, 40(sp)
+; RV64-NEXT:    ld a4, 48(sp)
+; RV64-NEXT:    ld a5, 56(sp)
+; RV64-NEXT:    ld a6, 64(sp)
+; RV64-NEXT:    ld a7, 72(sp)
+; RV64-NEXT:    ld t0, 80(sp)
+; RV64-NEXT:    ld t1, 88(sp)
+; RV64-NEXT:    ld t2, 96(sp)
+; RV64-NEXT:    ld t3, 104(sp)
+; RV64-NEXT:    ld t4, 112(sp)
+; RV64-NEXT:    ld t5, 120(sp)
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s t6, v8
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    add a6, a6, t0
-; RV64-NEXT:    add a0, a0, a6
-; RV64-NEXT:    add t1, t1, t2
-; RV64-NEXT:    add t1, t1, t3
-; RV64-NEXT:    add t1, t1, t4
-; RV64-NEXT:    add a0, a0, t1
-; RV64-NEXT:    add t5, t5, t6
-; RV64-NEXT:    add t5, t5, s2
-; RV64-NEXT:    add a0, a0, t5
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add t0, t0, t1
+; RV64-NEXT:    add t0, t0, t2
+; RV64-NEXT:    add t0, t0, t3
+; RV64-NEXT:    add a0, a0, t0
+; RV64-NEXT:    add t4, t4, t5
+; RV64-NEXT:    add a0, a0, t4
 ; RV64-NEXT:    addi sp, s0, -256
 ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 232(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 256
 ; RV64-NEXT:    ret
   %e0 = extractelement <16 x i64> %v, i32 0
@@ -1234,8 +1067,8 @@ define i64 @explode_16xi64(<16 x i64> %v) {
   %e13 = extractelement <16 x i64> %v, i32 13
   %e14 = extractelement <16 x i64> %v, i32 14
   %e15 = extractelement <16 x i64> %v, i32 15
-  %add0 = add i64 %e0, %e1
-  %add1 = xor i64 %add0, %e2
+  %add0 = xor i64 %e0, %e1
+  %add1 = add i64 %add0, %e2
   %add2 = add i64 %add1, %e3
   %add3 = add i64 %add2, %e4
   %add4 = add i64 %add3, %e5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 5ef6b291e30951..8c96392f08a5db 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -1,25 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 define i32 @reduce_sum_2xi32(<2 x i32> %v) {
-; RV32-LABEL: reduce_sum_2xi32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_2xi32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v8
-; RV64-NEXT:    addw a0, a0, a1
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_2xi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i32> %v, i32 0
   %e1 = extractelement <2 x i32> %v, i32 1
   %add0 = add i32 %e0, %e1
@@ -27,35 +17,13 @@ define i32 @reduce_sum_2xi32(<2 x i32> %v) {
 }
 
 define i32 @reduce_sum_4xi32(<4 x i32> %v) {
-; RV32-LABEL: reduce_sum_4xi32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_4xi32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v9
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    addw a0, a0, a2
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_4xi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
   %e2 = extractelement <4 x i32> %v, i32 2
@@ -68,61 +36,13 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 
 
 define i32 @reduce_sum_8xi32(<8 x i32> %v) {
-; RV32-LABEL: reduce_sum_8xi32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_8xi32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    addw a0, a0, a7
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_8xi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i32> %v, i32 0
   %e1 = extractelement <8 x i32> %v, i32 1
   %e2 = extractelement <8 x i32> %v, i32 2
@@ -142,131 +62,13 @@ define i32 @reduce_sum_8xi32(<8 x i32> %v) {
 }
 
 define i32 @reduce_sum_16xi32(<16 x i32> %v) {
-; RV32-LABEL: reduce_sum_16xi32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 116(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s2, -12
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v12
-; RV32-NEXT:    mv t0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (t0)
-; RV32-NEXT:    lw t0, 32(sp)
-; RV32-NEXT:    lw t1, 36(sp)
-; RV32-NEXT:    lw t2, 40(sp)
-; RV32-NEXT:    lw t3, 44(sp)
-; RV32-NEXT:    lw t4, 48(sp)
-; RV32-NEXT:    lw t5, 52(sp)
-; RV32-NEXT:    lw t6, 56(sp)
-; RV32-NEXT:    lw s2, 60(sp)
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t1, t1, t2
-; RV32-NEXT:    add t1, t1, t3
-; RV32-NEXT:    add a0, a0, t1
-; RV32-NEXT:    add t4, t4, t5
-; RV32-NEXT:    add t4, t4, t6
-; RV32-NEXT:    add t4, t4, s2
-; RV32-NEXT:    add a0, a0, t4
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 116(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 104(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s2, -24
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v12
-; RV64-NEXT:    mv t0, sp
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (t0)
-; RV64-NEXT:    lw t0, 32(sp)
-; RV64-NEXT:    lw t1, 36(sp)
-; RV64-NEXT:    lw t2, 40(sp)
-; RV64-NEXT:    lw t3, 44(sp)
-; RV64-NEXT:    lw t4, 48(sp)
-; RV64-NEXT:    lw t5, 52(sp)
-; RV64-NEXT:    lw t6, 56(sp)
-; RV64-NEXT:    lw s2, 60(sp)
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t1, t1, t2
-; RV64-NEXT:    add t1, t1, t3
-; RV64-NEXT:    add a0, a0, t1
-; RV64-NEXT:    add t4, t4, t5
-; RV64-NEXT:    add t4, t4, t6
-; RV64-NEXT:    add t4, t4, s2
-; RV64-NEXT:    addw a0, a0, t4
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 104(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
   %e2 = extractelement <16 x i32> %v, i32 2
@@ -302,27 +104,14 @@ define i32 @reduce_sum_16xi32(<16 x i32> %v) {
 }
 
 define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix2:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix2:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v8
-; RV64-NEXT:    addw a0, a0, a1
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -331,33 +120,15 @@ define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix3:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vslidedown.vi v8, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix3:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vslidedown.vi v8, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    addw a0, a0, a2
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vslideup.vi v8, v9, 3
+; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -368,39 +139,14 @@ define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix4:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix4:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v9
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    addw a0, a0, a2
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -413,47 +159,21 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix5:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix5:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    addw a0, a0, a4
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 224
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsext.vf4 v12, v8
+; CHECK-NEXT:    vand.vv v8, v10, v12
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -468,53 +188,21 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix6:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix6:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    addw a0, a0, a4
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 192
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsext.vf4 v12, v8
+; CHECK-NEXT:    vand.vv v8, v10, v12
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -531,59 +219,15 @@ define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix7:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix7:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    addw a0, a0, a4
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vslideup.vi v8, v10, 7
+; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -602,65 +246,14 @@ define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v10
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v10
-; RV64-NEXT:    vslidedown.vi v8, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    addw a0, a0, a7
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -681,101 +274,22 @@ define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix9:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v12
-; RV32-NEXT:    mv t0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (t0)
-; RV32-NEXT:    lw t0, 32(sp)
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix9:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v12
-; RV64-NEXT:    mv t0, sp
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (t0)
-; RV64-NEXT:    lw t0, 32(sp)
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    addw a0, a0, a7
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    li a0, -512
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v12, -1
+; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsext.vf4 v16, v12
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -798,117 +312,22 @@ define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix13:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v12
-; RV32-NEXT:    mv t0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (t0)
-; RV32-NEXT:    lw t0, 32(sp)
-; RV32-NEXT:    lw t1, 36(sp)
-; RV32-NEXT:    lw t2, 40(sp)
-; RV32-NEXT:    lw t3, 44(sp)
-; RV32-NEXT:    lw t4, 48(sp)
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a7, a7, t2
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t3, t3, t4
-; RV32-NEXT:    add a0, a0, t3
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix13:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v12
-; RV64-NEXT:    mv t0, sp
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (t0)
-; RV64-NEXT:    lw t0, 32(sp)
-; RV64-NEXT:    lw t1, 36(sp)
-; RV64-NEXT:    lw t2, 40(sp)
-; RV64-NEXT:    lw t3, 44(sp)
-; RV64-NEXT:    lw t4, 48(sp)
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a7, a7, t2
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t3, t3, t4
-; RV64-NEXT:    addw a0, a0, t3
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 14
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v12, -1
+; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsext.vf4 v16, v12
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -940,121 +359,22 @@ define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
 
 
 define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix14:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v12
-; RV32-NEXT:    mv t0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (t0)
-; RV32-NEXT:    lw t0, 32(sp)
-; RV32-NEXT:    lw t1, 36(sp)
-; RV32-NEXT:    lw t2, 40(sp)
-; RV32-NEXT:    lw t3, 44(sp)
-; RV32-NEXT:    lw t4, 48(sp)
-; RV32-NEXT:    lw t5, 52(sp)
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a7, a7, t2
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t3, t3, t4
-; RV32-NEXT:    add t3, t3, t5
-; RV32-NEXT:    add a0, a0, t3
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix14:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v12
-; RV64-NEXT:    mv t0, sp
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (t0)
-; RV64-NEXT:    lw t0, 32(sp)
-; RV64-NEXT:    lw t1, 36(sp)
-; RV64-NEXT:    lw t2, 40(sp)
-; RV64-NEXT:    lw t3, 44(sp)
-; RV64-NEXT:    lw t4, 48(sp)
-; RV64-NEXT:    lw t5, 52(sp)
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a7, a7, t2
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t3, t3, t4
-; RV64-NEXT:    add t3, t3, t5
-; RV64-NEXT:    addw a0, a0, t3
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 12
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v12, -1
+; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsext.vf4 v16, v12
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -1087,125 +407,15 @@ define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
 }
 
 define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
-; RV32-LABEL: reduce_sum_16xi32_prefix15:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a7, v12
-; RV32-NEXT:    mv t0, sp
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (t0)
-; RV32-NEXT:    lw t0, 32(sp)
-; RV32-NEXT:    lw t1, 36(sp)
-; RV32-NEXT:    lw t2, 40(sp)
-; RV32-NEXT:    lw t3, 44(sp)
-; RV32-NEXT:    lw t4, 48(sp)
-; RV32-NEXT:    lw t5, 52(sp)
-; RV32-NEXT:    lw t6, 56(sp)
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a7, a7, t2
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t3, t3, t4
-; RV32-NEXT:    add t3, t3, t5
-; RV32-NEXT:    add t3, t3, t6
-; RV32-NEXT:    add a0, a0, t3
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_sum_16xi32_prefix15:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a3, v12
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a4, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a6, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a7, v12
-; RV64-NEXT:    mv t0, sp
-; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (t0)
-; RV64-NEXT:    lw t0, 32(sp)
-; RV64-NEXT:    lw t1, 36(sp)
-; RV64-NEXT:    lw t2, 40(sp)
-; RV64-NEXT:    lw t3, 44(sp)
-; RV64-NEXT:    lw t4, 48(sp)
-; RV64-NEXT:    lw t5, 52(sp)
-; RV64-NEXT:    lw t6, 56(sp)
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    add a4, a4, a6
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a7, a7, t2
-; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t3, t3, t4
-; RV64-NEXT:    add t3, t3, t5
-; RV64-NEXT:    add t3, t3, t6
-; RV64-NEXT:    addw a0, a0, t3
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: reduce_sum_16xi32_prefix15:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vslideup.vi v8, v12, 15
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
   %e0 = extractelement <16 x i32> %v, i32 0
   %e1 = extractelement <16 x i32> %v, i32 1
@@ -1239,5 +449,335 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
   ret i32 %add13
 }
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
+;; Most of the cornercases are exercised above, the following just
+;; makes sure that other opcodes work as expected.
+
+define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_xor_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %xor0 = xor i32 %e0, %e1
+  ret i32 %xor0
+}
+
+define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
+; CHECK-LABEL: reduce_xor_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 224
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsext.vf4 v12, v8
+; CHECK-NEXT:    vand.vv v8, v10, v12
+; CHECK-NEXT:    vmv.s.x v10, zero
+; CHECK-NEXT:    vredxor.vs v8, v8, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %xor0 = xor i32 %e0, %e1
+  %xor1 = xor i32 %xor0, %e2
+  %xor2 = xor i32 %xor1, %e3
+  %xor3 = xor i32 %xor2, %e4
+  ret i32 %xor3
+}
+
+define i32 @reduce_and_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_and_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vredand.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %and0 = and i32 %e0, %e1
+  ret i32 %and0
+}
+
+define i32 @reduce_and_16xi32_prefix5(ptr %p) {
+; CHECK-LABEL: reduce_and_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-NEXT:    vredand.vs v8, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %and0 = and i32 %e0, %e1
+  %and1 = and i32 %and0, %e2
+  %and2 = and i32 %and1, %e3
+  %and3 = and i32 %and2, %e4
+  ret i32 %and3
+}
+
+define i32 @reduce_or_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_or_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vredor.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %or0 = or i32 %e0, %e1
+  ret i32 %or0
+}
+
+define i32 @reduce_or_16xi32_prefix5(ptr %p) {
+; CHECK-LABEL: reduce_or_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 224
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsext.vf4 v12, v8
+; CHECK-NEXT:    vand.vv v8, v10, v12
+; CHECK-NEXT:    vredor.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %or0 = or i32 %e0, %e1
+  %or1 = or i32 %or0, %e2
+  %or2 = or i32 %or1, %e3
+  %or3 = or i32 %or2, %e4
+  ret i32 %or3
+}
+
+declare i32 @llvm.smax.i32(i32 %a, i32 %b)
+declare i32 @llvm.smin.i32(i32 %a, i32 %b)
+declare i32 @llvm.umax.i32(i32 %a, i32 %b)
+declare i32 @llvm.umin.i32(i32 %a, i32 %b)
+
+define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_smax_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vredmax.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
+  ret i32 %smax0
+}
+
+define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
+; CHECK-LABEL: reduce_smax_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v10, a1
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 5
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 6
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 7
+; CHECK-NEXT:    vredmax.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
+  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
+  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
+  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
+  ret i32 %smax3
+}
+
+define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_smin_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vredmin.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
+  ret i32 %smin0
+}
+
+define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
+; RV32-LABEL: reduce_smin_16xi32_prefix5:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, 524288
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vmv.s.x v10, a1
+; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 5
+; RV32-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 6
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 7
+; RV32-NEXT:    vredmin.vs v8, v8, v8
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: reduce_smin_16xi32_prefix5:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 524288
+; RV64-NEXT:    addiw a1, a1, -1
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vmv.s.x v10, a1
+; RV64-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v10, 5
+; RV64-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v10, 6
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vslideup.vi v8, v10, 7
+; RV64-NEXT:    vredmin.vs v8, v8, v8
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
+  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
+  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
+  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
+  ret i32 %smin3
+}
+
+define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_umax_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
+  ret i32 %umax0
+}
+
+define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
+; CHECK-LABEL: reduce_umax_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 224
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsext.vf4 v12, v8
+; CHECK-NEXT:    vand.vv v8, v10, v12
+; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
+  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
+  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
+  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
+  ret i32 %umax3
+}
+
+define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
+; CHECK-LABEL: reduce_umin_16xi32_prefix2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vredminu.vs v8, v8, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
+  ret i32 %umin0
+}
+
+define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
+; CHECK-LABEL: reduce_umin_16xi32_prefix5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, -1
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-NEXT:    vredminu.vs v8, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %v = load <16 x i32>, ptr %p, align 256
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
+  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
+  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
+  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
+  ret i32 %umin3
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
index 4b7bfba06377da..49cd4b85693b60 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-V
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-V
-; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-ZVE32X
-; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-ZVE32X
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V
+; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X
+; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X
 
 define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) {
 ; CHECK-LABEL: shuffle_v8i1_as_i8_1:
@@ -15,17 +15,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_1:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 1
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_1:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 1
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_1:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 1
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_1:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 1
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
   ret <8 x i1> %shuffle
 }
@@ -39,17 +39,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_2(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_2:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 2
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_2:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 2
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_2:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 2
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_2:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 2
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
   ret <8 x i1> %shuffle
 }
@@ -63,17 +63,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_3(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_3:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 3
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_3:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 3
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_3:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 3
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_3:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 3
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
   ret <8 x i1> %shuffle
 }
@@ -87,17 +87,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_4(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_4:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 4
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_4:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 4
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_4:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 4
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_4:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 4
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i1> %shuffle
 }
@@ -111,17 +111,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_5(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_5:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 5
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_5:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 5
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_5:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 5
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_5:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 5
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
   ret <8 x i1> %shuffle
 }
@@ -135,17 +135,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_6(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_6:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 6
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_6:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 6
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_6:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 6
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_6:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 6
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
   ret <8 x i1> %shuffle
 }
@@ -159,17 +159,17 @@ define <8 x i1> @shuffle_v8i1_as_i8_7(<8 x i1> %v) {
 ; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i1_as_i8_7:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZVBB-V-NEXT:    vror.vi v0, v0, 7
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i1_as_i8_7:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v0, v0, 7
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i1_as_i8_7:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; ZVKB-V-NEXT:    vror.vi v0, v0, 7
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i1_as_i8_7:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v0, v0, 7
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
   ret <8 x i1> %shuffle
 }
@@ -183,17 +183,17 @@ define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i16:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVBB-V-NEXT:    vrev8.v v8, v8
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i16:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 4, e16, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vrev8.v v8, v8
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i16:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVKB-V-NEXT:    vrev8.v v8, v8
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i16:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 4, e16, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vrev8.v v8, v8
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   ret <8 x i8> %shuffle
 }
@@ -207,17 +207,17 @@ define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i32_8:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 8
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i32_8:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v8, v8, 8
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i32_8:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 8
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_8:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v8, v8, 8
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
   ret <8 x i8> %shuffle
 }
@@ -231,17 +231,17 @@ define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i32_16:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 16
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i32_16:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v8, v8, 16
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i32_16:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 16
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_16:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v8, v8, 16
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   ret <8 x i8> %shuffle
 }
@@ -255,17 +255,17 @@ define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i32_24:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 24
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i32_24:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v8, v8, 24
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i32_24:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 24
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_24:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v8, v8, 24
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
   ret <8 x i8> %shuffle
 }
@@ -279,19 +279,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_8(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_8:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 8
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_8:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 1
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 7
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_8:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 8
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_8:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 1
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 7
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
   ret <8 x i8> %shuffle
 }
@@ -305,19 +305,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_16(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_16:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 16
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_16:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 2
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 6
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_16:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 16
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_16:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 2
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 6
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
   ret <8 x i8> %shuffle
 }
@@ -331,19 +331,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_24(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_24:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 24
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_24:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 3
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 5
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_24:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 24
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_24:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 3
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 5
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
   ret <8 x i8> %shuffle
 }
@@ -357,19 +357,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_32(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_32:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 32
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_32:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 4
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 4
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_32:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 32
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_32:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 4
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 4
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i8> %shuffle
 }
@@ -383,19 +383,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_40(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_40:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 40
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_40:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 5
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 3
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_40:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 40
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_40:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 5
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 3
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
   ret <8 x i8> %shuffle
 }
@@ -409,19 +409,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_48(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_48:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 48
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_48:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 6
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 2
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_48:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 48
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_48:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 6
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 2
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
   ret <8 x i8> %shuffle
 }
@@ -435,19 +435,19 @@ define <8 x i8> @shuffle_v8i8_as_i64_56(<8 x i8> %v) {
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i8_as_i64_56:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 56
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i8_as_i64_56:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; ZVBB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 7
-; ZVBB-ZVE32X-NEXT:    vslideup.vi v10, v8, 1
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v10
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i8_as_i64_56:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 56
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_56:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
+; ZVKB-ZVE32X-NEXT:    vslidedown.vi v10, v8, 7
+; ZVKB-ZVE32X-NEXT:    vslideup.vi v10, v8, 1
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v10
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
   ret <8 x i8> %shuffle
 }
@@ -461,17 +461,17 @@ define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i16_as_i32:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 16
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i16_as_i32:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v8, v8, 16
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i16_as_i32:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 16
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i32:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v8, v8, 16
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   ret <8 x i16> %shuffle
 }
@@ -500,22 +500,22 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) {
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i16_as_i64_16:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 16
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i16_as_i64_16:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI19_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v12, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v16, v12
-; ZVBB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v12
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i16_as_i64_16:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 16
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_16:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v12, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v16, v12
+; ZVKB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v12
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
   ret <8 x i16> %shuffle
 }
@@ -544,22 +544,22 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i16_as_i64_32:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 32
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i16_as_i64_32:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI20_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI20_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v12, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v16, v12
-; ZVBB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v12
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i16_as_i64_32:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 32
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_32:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI20_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI20_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v12, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v16, v12
+; ZVKB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v12
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   ret <8 x i16> %shuffle
 }
@@ -588,22 +588,22 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i16_as_i64_48:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 48
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i16_as_i64_48:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI21_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI21_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v12, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v16, v12
-; ZVBB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v12
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i16_as_i64_48:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 48
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_48:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI21_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v12, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v16, v12
+; ZVKB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v12
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
   ret <8 x i16> %shuffle
 }
@@ -632,23 +632,23 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8i32_as_i64:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 32
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8i32_as_i64:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI22_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI22_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v16, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v24, v16
-; ZVBB-ZVE32X-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVBB-ZVE32X-NEXT:    vrgatherei16.vv v16, v8, v24
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v16
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8i32_as_i64:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 32
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8i32_as_i64:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI22_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI22_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v16, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v24, v16
+; ZVKB-ZVE32X-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVKB-ZVE32X-NEXT:    vrgatherei16.vv v16, v8, v24
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v16
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   ret <8 x i32> %shuffle
 }
@@ -662,17 +662,17 @@ define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8f16_as_i32:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 16
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8f16_as_i32:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vror.vi v8, v8, 16
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8f16_as_i32:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 16
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i32:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vror.vi v8, v8, 16
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   ret <8 x half> %shuffle
 }
@@ -701,22 +701,22 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) {
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8f16_as_i64_16:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 16
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8f16_as_i64_16:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI24_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI24_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v12, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v16, v12
-; ZVBB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v12
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8f16_as_i64_16:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 16
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_16:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI24_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI24_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v12, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v16, v12
+; ZVKB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v12
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
   ret <8 x half> %shuffle
 }
@@ -745,22 +745,22 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8f16_as_i64_32:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 32
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8f16_as_i64_32:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI25_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI25_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v12, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v16, v12
-; ZVBB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v12
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8f16_as_i64_32:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 32
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_32:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI25_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI25_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v12, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v16, v12
+; ZVKB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v12
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   ret <8 x half> %shuffle
 }
@@ -789,22 +789,22 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8f16_as_i64_48:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 48
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8f16_as_i64_48:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI26_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI26_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v12, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v16, v12
-; ZVBB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v12
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8f16_as_i64_48:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 48
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_48:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI26_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI26_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v12, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v16, v12
+; ZVKB-ZVE32X-NEXT:    vrgather.vv v12, v8, v16
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v12
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
   ret <8 x half> %shuffle
 }
@@ -833,23 +833,23 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
-; ZVBB-V-LABEL: shuffle_v8f32_as_i64:
-; ZVBB-V:       # %bb.0:
-; ZVBB-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVBB-V-NEXT:    vror.vi v8, v8, 32
-; ZVBB-V-NEXT:    ret
-;
-; ZVBB-ZVE32X-LABEL: shuffle_v8f32_as_i64:
-; ZVBB-ZVE32X:       # %bb.0:
-; ZVBB-ZVE32X-NEXT:    lui a0, %hi(.LCPI27_0)
-; ZVBB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI27_0)
-; ZVBB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
-; ZVBB-ZVE32X-NEXT:    vle8.v v16, (a0)
-; ZVBB-ZVE32X-NEXT:    vsext.vf2 v24, v16
-; ZVBB-ZVE32X-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVBB-ZVE32X-NEXT:    vrgatherei16.vv v16, v8, v24
-; ZVBB-ZVE32X-NEXT:    vmv.v.v v8, v16
-; ZVBB-ZVE32X-NEXT:    ret
+; ZVKB-V-LABEL: shuffle_v8f32_as_i64:
+; ZVKB-V:       # %bb.0:
+; ZVKB-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZVKB-V-NEXT:    vror.vi v8, v8, 32
+; ZVKB-V-NEXT:    ret
+;
+; ZVKB-ZVE32X-LABEL: shuffle_v8f32_as_i64:
+; ZVKB-ZVE32X:       # %bb.0:
+; ZVKB-ZVE32X-NEXT:    lui a0, %hi(.LCPI27_0)
+; ZVKB-ZVE32X-NEXT:    addi a0, a0, %lo(.LCPI27_0)
+; ZVKB-ZVE32X-NEXT:    vsetivli zero, 8, e16, m4, ta, ma
+; ZVKB-ZVE32X-NEXT:    vle8.v v16, (a0)
+; ZVKB-ZVE32X-NEXT:    vsext.vf2 v24, v16
+; ZVKB-ZVE32X-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVKB-ZVE32X-NEXT:    vrgatherei16.vv v16, v8, v24
+; ZVKB-ZVE32X-NEXT:    vmv.v.v v8, v16
+; ZVKB-ZVE32X-NEXT:    ret
   %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   ret <8 x float> %shuffle
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
index 53957586a8c629..99a933985a351d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
 
 declare <1 x i8> @llvm.fshl.v1i8(<1 x i8>, <1 x i8>, <1 x i8>)
 
@@ -18,11 +18,11 @@ define <1 x i8> @vrol_vv_v1i8(<1 x i8> %a, <1 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i8> @llvm.fshl.v1i8(<1 x i8> %a, <1 x i8> %a, <1 x i8> %b)
   ret <1 x i8> %x
 }
@@ -40,11 +40,11 @@ define <1 x i8> @vrol_vx_v1i8(<1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <1 x i8> %b.head, <1 x i8> poison, <1 x i32> zeroinitializer
   %x = call <1 x i8> @llvm.fshl.v1i8(<1 x i8> %a, <1 x i8> %a, <1 x i8> %b.splat)
@@ -65,11 +65,11 @@ define <2 x i8> @vrol_vv_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %a, <2 x i8> %a, <2 x i8> %b)
   ret <2 x i8> %x
 }
@@ -87,11 +87,11 @@ define <2 x i8> @vrol_vx_v2i8(<2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <2 x i8> %b.head, <2 x i8> poison, <2 x i32> zeroinitializer
   %x = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %a, <2 x i8> %a, <2 x i8> %b.splat)
@@ -112,11 +112,11 @@ define <4 x i8> @vrol_vv_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %a, <4 x i8> %a, <4 x i8> %b)
   ret <4 x i8> %x
 }
@@ -134,11 +134,11 @@ define <4 x i8> @vrol_vx_v4i8(<4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <4 x i8> %b.head, <4 x i8> poison, <4 x i32> zeroinitializer
   %x = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %a, <4 x i8> %a, <4 x i8> %b.splat)
@@ -159,11 +159,11 @@ define <8 x i8> @vrol_vv_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> %a, <8 x i8> %a, <8 x i8> %b)
   ret <8 x i8> %x
 }
@@ -181,11 +181,11 @@ define <8 x i8> @vrol_vx_v8i8(<8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <8 x i8> %b.head, <8 x i8> poison, <8 x i32> zeroinitializer
   %x = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> %a, <8 x i8> %a, <8 x i8> %b.splat)
@@ -206,11 +206,11 @@ define <16 x i8> @vrol_vv_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %b)
   ret <16 x i8> %x
 }
@@ -228,11 +228,11 @@ define <16 x i8> @vrol_vx_v16i8(<16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <16 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <16 x i8> %b.head, <16 x i8> poison, <16 x i32> zeroinitializer
   %x = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %b.splat)
@@ -254,12 +254,12 @@ define <32 x i8> @vrol_vv_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> %b)
   ret <32 x i8> %x
 }
@@ -278,12 +278,12 @@ define <32 x i8> @vrol_vx_v32i8(<32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a1, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <32 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <32 x i8> %b.head, <32 x i8> poison, <32 x i32> zeroinitializer
   %x = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> %b.splat)
@@ -305,12 +305,12 @@ define <64 x i8> @vrol_vv_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 64
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 64
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> %b)
   ret <64 x i8> %x
 }
@@ -329,12 +329,12 @@ define <64 x i8> @vrol_vx_v64i8(<64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 64
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a1, 64
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <64 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <64 x i8> %b.head, <64 x i8> poison, <64 x i32> zeroinitializer
   %x = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> %b.splat)
@@ -355,11 +355,11 @@ define <1 x i16> @vrol_vv_v1i16(<1 x i16> %a, <1 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i16> @llvm.fshl.v1i16(<1 x i16> %a, <1 x i16> %a, <1 x i16> %b)
   ret <1 x i16> %x
 }
@@ -377,11 +377,11 @@ define <1 x i16> @vrol_vx_v1i16(<1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <1 x i16> %b.head, <1 x i16> poison, <1 x i32> zeroinitializer
   %x = call <1 x i16> @llvm.fshl.v1i16(<1 x i16> %a, <1 x i16> %a, <1 x i16> %b.splat)
@@ -402,11 +402,11 @@ define <2 x i16> @vrol_vv_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %a, <2 x i16> %a, <2 x i16> %b)
   ret <2 x i16> %x
 }
@@ -424,11 +424,11 @@ define <2 x i16> @vrol_vx_v2i16(<2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <2 x i16> %b.head, <2 x i16> poison, <2 x i32> zeroinitializer
   %x = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %a, <2 x i16> %a, <2 x i16> %b.splat)
@@ -449,11 +449,11 @@ define <4 x i16> @vrol_vv_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %a, <4 x i16> %a, <4 x i16> %b)
   ret <4 x i16> %x
 }
@@ -471,11 +471,11 @@ define <4 x i16> @vrol_vx_v4i16(<4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <4 x i16> %b.head, <4 x i16> poison, <4 x i32> zeroinitializer
   %x = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %a, <4 x i16> %a, <4 x i16> %b.splat)
@@ -496,11 +496,11 @@ define <8 x i16> @vrol_vv_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %x
 }
@@ -518,11 +518,11 @@ define <8 x i16> @vrol_vx_v8i16(<8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <8 x i16> %b.head, <8 x i16> poison, <8 x i32> zeroinitializer
   %x = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %b.splat)
@@ -543,11 +543,11 @@ define <16 x i16> @vrol_vv_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a, <16 x i16> %a, <16 x i16> %b)
   ret <16 x i16> %x
 }
@@ -565,11 +565,11 @@ define <16 x i16> @vrol_vx_v16i16(<16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <16 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <16 x i16> %b.head, <16 x i16> poison, <16 x i32> zeroinitializer
   %x = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a, <16 x i16> %a, <16 x i16> %b.splat)
@@ -591,12 +591,12 @@ define <32 x i16> @vrol_vv_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a, <32 x i16> %a, <32 x i16> %b)
   ret <32 x i16> %x
 }
@@ -615,12 +615,12 @@ define <32 x i16> @vrol_vx_v32i16(<32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a1, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <32 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <32 x i16> %b.head, <32 x i16> poison, <32 x i32> zeroinitializer
   %x = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a, <32 x i16> %a, <32 x i16> %b.splat)
@@ -642,11 +642,11 @@ define <1 x i32> @vrol_vv_v1i32(<1 x i32> %a, <1 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i32> @llvm.fshl.v1i32(<1 x i32> %a, <1 x i32> %a, <1 x i32> %b)
   ret <1 x i32> %x
 }
@@ -665,11 +665,11 @@ define <1 x i32> @vrol_vx_v1i32(<1 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <1 x i32> %b.head, <1 x i32> poison, <1 x i32> zeroinitializer
   %x = call <1 x i32> @llvm.fshl.v1i32(<1 x i32> %a, <1 x i32> %a, <1 x i32> %b.splat)
@@ -691,11 +691,11 @@ define <2 x i32> @vrol_vv_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %a, <2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %x
 }
@@ -714,11 +714,11 @@ define <2 x i32> @vrol_vx_v2i32(<2 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <2 x i32> %b.head, <2 x i32> poison, <2 x i32> zeroinitializer
   %x = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %a, <2 x i32> %a, <2 x i32> %b.splat)
@@ -740,11 +740,11 @@ define <4 x i32> @vrol_vv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %x
 }
@@ -763,11 +763,11 @@ define <4 x i32> @vrol_vx_v4i32(<4 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <4 x i32> %b.head, <4 x i32> poison, <4 x i32> zeroinitializer
   %x = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %b.splat)
@@ -789,11 +789,11 @@ define <8 x i32> @vrol_vv_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a, <8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %x
 }
@@ -812,11 +812,11 @@ define <8 x i32> @vrol_vx_v8i32(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <8 x i32> %b.head, <8 x i32> poison, <8 x i32> zeroinitializer
   %x = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a, <8 x i32> %a, <8 x i32> %b.splat)
@@ -838,11 +838,11 @@ define <16 x i32> @vrol_vv_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a, <16 x i32> %a, <16 x i32> %b)
   ret <16 x i32> %x
 }
@@ -861,11 +861,11 @@ define <16 x i32> @vrol_vx_v16i32(<16 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <16 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <16 x i32> %b.head, <16 x i32> poison, <16 x i32> zeroinitializer
   %x = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a, <16 x i32> %a, <16 x i32> %b.splat)
@@ -887,11 +887,11 @@ define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %a, <1 x i64> %a, <1 x i64> %b)
   ret <1 x i64> %x
 }
@@ -910,11 +910,11 @@ define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <1 x i64> %b.head, <1 x i64> poison, <1 x i32> zeroinitializer
   %x = call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %a, <1 x i64> %a, <1 x i64> %b.splat)
@@ -936,11 +936,11 @@ define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %b)
   ret <2 x i64> %x
 }
@@ -959,11 +959,11 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <2 x i64> %b.head, <2 x i64> poison, <2 x i32> zeroinitializer
   %x = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %b.splat)
@@ -985,11 +985,11 @@ define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> %b)
   ret <4 x i64> %x
 }
@@ -1008,11 +1008,11 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <4 x i64> %b.head, <4 x i64> poison, <4 x i32> zeroinitializer
   %x = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> %b.splat)
@@ -1034,11 +1034,11 @@ define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_v8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_v8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> %b)
   ret <8 x i64> %x
 }
@@ -1057,16 +1057,13 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_v8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_v8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <8 x i64> %b.head, <8 x i64> poison, <8 x i32> zeroinitializer
   %x = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> %b.splat)
   ret <8 x i64> %x
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-RV32: {{.*}}
-; CHECK-RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
index 8de463af78d6ff..aa63013eec1202 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
 
 declare <1 x i8> @llvm.fshr.v1i8(<1 x i8>, <1 x i8>, <1 x i8>)
 declare <1 x i8> @llvm.fshl.v1i8(<1 x i8>, <1 x i8>, <1 x i8>)
@@ -19,11 +19,11 @@ define <1 x i8> @vror_vv_v1i8(<1 x i8> %a, <1 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i8> @llvm.fshr.v1i8(<1 x i8> %a, <1 x i8> %a, <1 x i8> %b)
   ret <1 x i8> %x
 }
@@ -41,11 +41,11 @@ define <1 x i8> @vror_vx_v1i8(<1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <1 x i8> %b.head, <1 x i8> poison, <1 x i32> zeroinitializer
   %x = call <1 x i8> @llvm.fshr.v1i8(<1 x i8> %a, <1 x i8> %a, <1 x i8> %b.splat)
@@ -61,11 +61,11 @@ define <1 x i8> @vror_vi_v1i8(<1 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i8> @llvm.fshr.v1i8(<1 x i8> %a, <1 x i8> %a, <1 x i8> shufflevector(<1 x i8> insertelement(<1 x i8> poison, i8 1, i32 0), <1 x i8> poison, <1 x i32> zeroinitializer))
   ret <1 x i8> %x
 }
@@ -79,11 +79,11 @@ define <1 x i8> @vror_vi_rotl_v1i8(<1 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i8> @llvm.fshl.v1i8(<1 x i8> %a, <1 x i8> %a, <1 x i8> shufflevector(<1 x i8> insertelement(<1 x i8> poison, i8 1, i32 0), <1 x i8> poison, <1 x i32> zeroinitializer))
   ret <1 x i8> %x
 }
@@ -103,11 +103,11 @@ define <2 x i8> @vror_vv_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %a, <2 x i8> %a, <2 x i8> %b)
   ret <2 x i8> %x
 }
@@ -125,11 +125,11 @@ define <2 x i8> @vror_vx_v2i8(<2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <2 x i8> %b.head, <2 x i8> poison, <2 x i32> zeroinitializer
   %x = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %a, <2 x i8> %a, <2 x i8> %b.splat)
@@ -145,11 +145,11 @@ define <2 x i8> @vror_vi_v2i8(<2 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %a, <2 x i8> %a, <2 x i8> shufflevector(<2 x i8> insertelement(<2 x i8> poison, i8 1, i32 0), <2 x i8> poison, <2 x i32> zeroinitializer))
   ret <2 x i8> %x
 }
@@ -163,11 +163,11 @@ define <2 x i8> @vror_vi_rotl_v2i8(<2 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %a, <2 x i8> %a, <2 x i8> shufflevector(<2 x i8> insertelement(<2 x i8> poison, i8 1, i32 0), <2 x i8> poison, <2 x i32> zeroinitializer))
   ret <2 x i8> %x
 }
@@ -187,11 +187,11 @@ define <4 x i8> @vror_vv_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %a, <4 x i8> %a, <4 x i8> %b)
   ret <4 x i8> %x
 }
@@ -209,11 +209,11 @@ define <4 x i8> @vror_vx_v4i8(<4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <4 x i8> %b.head, <4 x i8> poison, <4 x i32> zeroinitializer
   %x = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %a, <4 x i8> %a, <4 x i8> %b.splat)
@@ -229,11 +229,11 @@ define <4 x i8> @vror_vi_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %a, <4 x i8> %a, <4 x i8> shufflevector(<4 x i8> insertelement(<4 x i8> poison, i8 1, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer))
   ret <4 x i8> %x
 }
@@ -247,11 +247,11 @@ define <4 x i8> @vror_vi_rotl_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %a, <4 x i8> %a, <4 x i8> shufflevector(<4 x i8> insertelement(<4 x i8> poison, i8 1, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer))
   ret <4 x i8> %x
 }
@@ -271,11 +271,11 @@ define <8 x i8> @vror_vv_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i8> @llvm.fshr.v8i8(<8 x i8> %a, <8 x i8> %a, <8 x i8> %b)
   ret <8 x i8> %x
 }
@@ -293,11 +293,11 @@ define <8 x i8> @vror_vx_v8i8(<8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <8 x i8> %b.head, <8 x i8> poison, <8 x i32> zeroinitializer
   %x = call <8 x i8> @llvm.fshr.v8i8(<8 x i8> %a, <8 x i8> %a, <8 x i8> %b.splat)
@@ -313,11 +313,11 @@ define <8 x i8> @vror_vi_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i8> @llvm.fshr.v8i8(<8 x i8> %a, <8 x i8> %a, <8 x i8> shufflevector(<8 x i8> insertelement(<8 x i8> poison, i8 1, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer))
   ret <8 x i8> %x
 }
@@ -331,11 +331,11 @@ define <8 x i8> @vror_vi_rotl_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> %a, <8 x i8> %a, <8 x i8> shufflevector(<8 x i8> insertelement(<8 x i8> poison, i8 1, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer))
   ret <8 x i8> %x
 }
@@ -355,11 +355,11 @@ define <16 x i8> @vror_vv_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %b)
   ret <16 x i8> %x
 }
@@ -377,11 +377,11 @@ define <16 x i8> @vror_vx_v16i8(<16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <16 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <16 x i8> %b.head, <16 x i8> poison, <16 x i32> zeroinitializer
   %x = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %b.splat)
@@ -397,11 +397,11 @@ define <16 x i8> @vror_vi_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> shufflevector(<16 x i8> insertelement(<16 x i8> poison, i8 1, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer))
   ret <16 x i8> %x
 }
@@ -415,11 +415,11 @@ define <16 x i8> @vror_vi_rotl_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> shufflevector(<16 x i8> insertelement(<16 x i8> poison, i8 1, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer))
   ret <16 x i8> %x
 }
@@ -440,12 +440,12 @@ define <32 x i8> @vror_vv_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> %b)
   ret <32 x i8> %x
 }
@@ -464,12 +464,12 @@ define <32 x i8> @vror_vx_v32i8(<32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a1, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <32 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <32 x i8> %b.head, <32 x i8> poison, <32 x i32> zeroinitializer
   %x = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> %b.splat)
@@ -486,12 +486,12 @@ define <32 x i8> @vror_vi_v32i8(<32 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> shufflevector(<32 x i8> insertelement(<32 x i8> poison, i8 1, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer))
   ret <32 x i8> %x
 }
@@ -506,12 +506,12 @@ define <32 x i8> @vror_vi_rotl_v32i8(<32 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> shufflevector(<32 x i8> insertelement(<32 x i8> poison, i8 1, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer))
   ret <32 x i8> %x
 }
@@ -532,12 +532,12 @@ define <64 x i8> @vror_vv_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 64
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 64
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> %b)
   ret <64 x i8> %x
 }
@@ -556,12 +556,12 @@ define <64 x i8> @vror_vx_v64i8(<64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 64
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a1, 64
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <64 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <64 x i8> %b.head, <64 x i8> poison, <64 x i32> zeroinitializer
   %x = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> %b.splat)
@@ -578,12 +578,12 @@ define <64 x i8> @vror_vi_v64i8(<64 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 64
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 64
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> shufflevector(<64 x i8> insertelement(<64 x i8> poison, i8 1, i32 0), <64 x i8> poison, <64 x i32> zeroinitializer))
   ret <64 x i8> %x
 }
@@ -598,12 +598,12 @@ define <64 x i8> @vror_vi_rotl_v64i8(<64 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 64
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 64
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> shufflevector(<64 x i8> insertelement(<64 x i8> poison, i8 1, i32 0), <64 x i8> poison, <64 x i32> zeroinitializer))
   ret <64 x i8> %x
 }
@@ -623,11 +623,11 @@ define <1 x i16> @vror_vv_v1i16(<1 x i16> %a, <1 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i16> @llvm.fshr.v1i16(<1 x i16> %a, <1 x i16> %a, <1 x i16> %b)
   ret <1 x i16> %x
 }
@@ -645,11 +645,11 @@ define <1 x i16> @vror_vx_v1i16(<1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <1 x i16> %b.head, <1 x i16> poison, <1 x i32> zeroinitializer
   %x = call <1 x i16> @llvm.fshr.v1i16(<1 x i16> %a, <1 x i16> %a, <1 x i16> %b.splat)
@@ -665,11 +665,11 @@ define <1 x i16> @vror_vi_v1i16(<1 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i16> @llvm.fshr.v1i16(<1 x i16> %a, <1 x i16> %a, <1 x i16> shufflevector(<1 x i16> insertelement(<1 x i16> poison, i16 1, i32 0), <1 x i16> poison, <1 x i32> zeroinitializer))
   ret <1 x i16> %x
 }
@@ -683,11 +683,11 @@ define <1 x i16> @vror_vi_rotl_v1i16(<1 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i16> @llvm.fshl.v1i16(<1 x i16> %a, <1 x i16> %a, <1 x i16> shufflevector(<1 x i16> insertelement(<1 x i16> poison, i16 1, i32 0), <1 x i16> poison, <1 x i32> zeroinitializer))
   ret <1 x i16> %x
 }
@@ -707,11 +707,11 @@ define <2 x i16> @vror_vv_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %a, <2 x i16> %a, <2 x i16> %b)
   ret <2 x i16> %x
 }
@@ -729,11 +729,11 @@ define <2 x i16> @vror_vx_v2i16(<2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <2 x i16> %b.head, <2 x i16> poison, <2 x i32> zeroinitializer
   %x = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %a, <2 x i16> %a, <2 x i16> %b.splat)
@@ -749,11 +749,11 @@ define <2 x i16> @vror_vi_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %a, <2 x i16> %a, <2 x i16> shufflevector(<2 x i16> insertelement(<2 x i16> poison, i16 1, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer))
   ret <2 x i16> %x
 }
@@ -767,11 +767,11 @@ define <2 x i16> @vror_vi_rotl_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %a, <2 x i16> %a, <2 x i16> shufflevector(<2 x i16> insertelement(<2 x i16> poison, i16 1, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer))
   ret <2 x i16> %x
 }
@@ -791,11 +791,11 @@ define <4 x i16> @vror_vv_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %a, <4 x i16> %a, <4 x i16> %b)
   ret <4 x i16> %x
 }
@@ -813,11 +813,11 @@ define <4 x i16> @vror_vx_v4i16(<4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <4 x i16> %b.head, <4 x i16> poison, <4 x i32> zeroinitializer
   %x = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %a, <4 x i16> %a, <4 x i16> %b.splat)
@@ -833,11 +833,11 @@ define <4 x i16> @vror_vi_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %a, <4 x i16> %a, <4 x i16> shufflevector(<4 x i16> insertelement(<4 x i16> poison, i16 1, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer))
   ret <4 x i16> %x
 }
@@ -851,11 +851,11 @@ define <4 x i16> @vror_vi_rotl_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %a, <4 x i16> %a, <4 x i16> shufflevector(<4 x i16> insertelement(<4 x i16> poison, i16 1, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer))
   ret <4 x i16> %x
 }
@@ -875,11 +875,11 @@ define <8 x i16> @vror_vv_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %x
 }
@@ -897,11 +897,11 @@ define <8 x i16> @vror_vx_v8i16(<8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <8 x i16> %b.head, <8 x i16> poison, <8 x i32> zeroinitializer
   %x = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %b.splat)
@@ -917,11 +917,11 @@ define <8 x i16> @vror_vi_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> shufflevector(<8 x i16> insertelement(<8 x i16> poison, i16 1, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer))
   ret <8 x i16> %x
 }
@@ -935,11 +935,11 @@ define <8 x i16> @vror_vi_rotl_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> shufflevector(<8 x i16> insertelement(<8 x i16> poison, i16 1, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer))
   ret <8 x i16> %x
 }
@@ -959,11 +959,11 @@ define <16 x i16> @vror_vv_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a, <16 x i16> %a, <16 x i16> %b)
   ret <16 x i16> %x
 }
@@ -981,11 +981,11 @@ define <16 x i16> @vror_vx_v16i16(<16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <16 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <16 x i16> %b.head, <16 x i16> poison, <16 x i32> zeroinitializer
   %x = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a, <16 x i16> %a, <16 x i16> %b.splat)
@@ -1001,11 +1001,11 @@ define <16 x i16> @vror_vi_v16i16(<16 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a, <16 x i16> %a, <16 x i16> shufflevector(<16 x i16> insertelement(<16 x i16> poison, i16 1, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer))
   ret <16 x i16> %x
 }
@@ -1019,11 +1019,11 @@ define <16 x i16> @vror_vi_rotl_v16i16(<16 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a, <16 x i16> %a, <16 x i16> shufflevector(<16 x i16> insertelement(<16 x i16> poison, i16 1, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer))
   ret <16 x i16> %x
 }
@@ -1044,12 +1044,12 @@ define <32 x i16> @vror_vv_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a, <32 x i16> %a, <32 x i16> %b)
   ret <32 x i16> %x
 }
@@ -1068,12 +1068,12 @@ define <32 x i16> @vror_vx_v32i16(<32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a1, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a1, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <32 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <32 x i16> %b.head, <32 x i16> poison, <32 x i32> zeroinitializer
   %x = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a, <32 x i16> %a, <32 x i16> %b.splat)
@@ -1090,12 +1090,12 @@ define <32 x i16> @vror_vi_v32i16(<32 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a, <32 x i16> %a, <32 x i16> shufflevector(<32 x i16> insertelement(<32 x i16> poison, i16 1, i32 0), <32 x i16> poison, <32 x i32> zeroinitializer))
   ret <32 x i16> %x
 }
@@ -1110,12 +1110,12 @@ define <32 x i16> @vror_vi_rotl_v32i16(<32 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    li a0, 32
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    li a0, 32
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a, <32 x i16> %a, <32 x i16> shufflevector(<32 x i16> insertelement(<32 x i16> poison, i16 1, i32 0), <32 x i16> poison, <32 x i32> zeroinitializer))
   ret <32 x i16> %x
 }
@@ -1136,11 +1136,11 @@ define <1 x i32> @vror_vv_v1i32(<1 x i32> %a, <1 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i32> @llvm.fshr.v1i32(<1 x i32> %a, <1 x i32> %a, <1 x i32> %b)
   ret <1 x i32> %x
 }
@@ -1159,11 +1159,11 @@ define <1 x i32> @vror_vx_v1i32(<1 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <1 x i32> %b.head, <1 x i32> poison, <1 x i32> zeroinitializer
   %x = call <1 x i32> @llvm.fshr.v1i32(<1 x i32> %a, <1 x i32> %a, <1 x i32> %b.splat)
@@ -1179,11 +1179,11 @@ define <1 x i32> @vror_vi_v1i32(<1 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i32> @llvm.fshr.v1i32(<1 x i32> %a, <1 x i32> %a, <1 x i32> shufflevector(<1 x i32> insertelement(<1 x i32> poison, i32 1, i32 0), <1 x i32> poison, <1 x i32> zeroinitializer))
   ret <1 x i32> %x
 }
@@ -1197,11 +1197,11 @@ define <1 x i32> @vror_vi_rotl_v1i32(<1 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i32> @llvm.fshl.v1i32(<1 x i32> %a, <1 x i32> %a, <1 x i32> shufflevector(<1 x i32> insertelement(<1 x i32> poison, i32 1, i32 0), <1 x i32> poison, <1 x i32> zeroinitializer))
   ret <1 x i32> %x
 }
@@ -1222,11 +1222,11 @@ define <2 x i32> @vror_vv_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %a, <2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %x
 }
@@ -1245,11 +1245,11 @@ define <2 x i32> @vror_vx_v2i32(<2 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <2 x i32> %b.head, <2 x i32> poison, <2 x i32> zeroinitializer
   %x = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %a, <2 x i32> %a, <2 x i32> %b.splat)
@@ -1265,11 +1265,11 @@ define <2 x i32> @vror_vi_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %a, <2 x i32> %a, <2 x i32> shufflevector(<2 x i32> insertelement(<2 x i32> poison, i32 1, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer))
   ret <2 x i32> %x
 }
@@ -1283,11 +1283,11 @@ define <2 x i32> @vror_vi_rotl_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %a, <2 x i32> %a, <2 x i32> shufflevector(<2 x i32> insertelement(<2 x i32> poison, i32 1, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer))
   ret <2 x i32> %x
 }
@@ -1308,11 +1308,11 @@ define <4 x i32> @vror_vv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %x
 }
@@ -1331,11 +1331,11 @@ define <4 x i32> @vror_vx_v4i32(<4 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <4 x i32> %b.head, <4 x i32> poison, <4 x i32> zeroinitializer
   %x = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %b.splat)
@@ -1351,11 +1351,11 @@ define <4 x i32> @vror_vi_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> shufflevector(<4 x i32> insertelement(<4 x i32> poison, i32 1, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer))
   ret <4 x i32> %x
 }
@@ -1369,11 +1369,11 @@ define <4 x i32> @vror_vi_rotl_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> shufflevector(<4 x i32> insertelement(<4 x i32> poison, i32 1, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer))
   ret <4 x i32> %x
 }
@@ -1394,11 +1394,11 @@ define <8 x i32> @vror_vv_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a, <8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %x
 }
@@ -1417,11 +1417,11 @@ define <8 x i32> @vror_vx_v8i32(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <8 x i32> %b.head, <8 x i32> poison, <8 x i32> zeroinitializer
   %x = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a, <8 x i32> %a, <8 x i32> %b.splat)
@@ -1437,11 +1437,11 @@ define <8 x i32> @vror_vi_v8i32(<8 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a, <8 x i32> %a, <8 x i32> shufflevector(<8 x i32> insertelement(<8 x i32> poison, i32 1, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer))
   ret <8 x i32> %x
 }
@@ -1455,11 +1455,11 @@ define <8 x i32> @vror_vi_rotl_v8i32(<8 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a, <8 x i32> %a, <8 x i32> shufflevector(<8 x i32> insertelement(<8 x i32> poison, i32 1, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer))
   ret <8 x i32> %x
 }
@@ -1480,11 +1480,11 @@ define <16 x i32> @vror_vv_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a, <16 x i32> %a, <16 x i32> %b)
   ret <16 x i32> %x
 }
@@ -1503,11 +1503,11 @@ define <16 x i32> @vror_vx_v16i32(<16 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <16 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <16 x i32> %b.head, <16 x i32> poison, <16 x i32> zeroinitializer
   %x = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a, <16 x i32> %a, <16 x i32> %b.splat)
@@ -1523,11 +1523,11 @@ define <16 x i32> @vror_vi_v16i32(<16 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a, <16 x i32> %a, <16 x i32> shufflevector(<16 x i32> insertelement(<16 x i32> poison, i32 1, i32 0), <16 x i32> poison, <16 x i32> zeroinitializer))
   ret <16 x i32> %x
 }
@@ -1541,11 +1541,11 @@ define <16 x i32> @vror_vi_rotl_v16i32(<16 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a, <16 x i32> %a, <16 x i32> shufflevector(<16 x i32> insertelement(<16 x i32> poison, i32 1, i32 0), <16 x i32> poison, <16 x i32> zeroinitializer))
   ret <16 x i32> %x
 }
@@ -1566,11 +1566,11 @@ define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i64> @llvm.fshr.v1i64(<1 x i64> %a, <1 x i64> %a, <1 x i64> %b)
   ret <1 x i64> %x
 }
@@ -1589,11 +1589,11 @@ define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <1 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <1 x i64> %b.head, <1 x i64> poison, <1 x i32> zeroinitializer
   %x = call <1 x i64> @llvm.fshr.v1i64(<1 x i64> %a, <1 x i64> %a, <1 x i64> %b.splat)
@@ -1624,11 +1624,11 @@ define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i64> @llvm.fshr.v1i64(<1 x i64> %a, <1 x i64> %a, <1 x i64> shufflevector(<1 x i64> insertelement(<1 x i64> poison, i64 1, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer))
   ret <1 x i64> %x
 }
@@ -1657,11 +1657,11 @@ define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %a, <1 x i64> %a, <1 x i64> shufflevector(<1 x i64> insertelement(<1 x i64> poison, i64 1, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer))
   ret <1 x i64> %x
 }
@@ -1682,11 +1682,11 @@ define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %b)
   ret <2 x i64> %x
 }
@@ -1705,11 +1705,11 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <2 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <2 x i64> %b.head, <2 x i64> poison, <2 x i32> zeroinitializer
   %x = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %b.splat)
@@ -1740,11 +1740,11 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> shufflevector(<2 x i64> insertelement(<2 x i64> poison, i64 1, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer))
   ret <2 x i64> %x
 }
@@ -1773,11 +1773,11 @@ define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> shufflevector(<2 x i64> insertelement(<2 x i64> poison, i64 1, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer))
   ret <2 x i64> %x
 }
@@ -1798,11 +1798,11 @@ define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> %b)
   ret <4 x i64> %x
 }
@@ -1821,11 +1821,11 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <4 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <4 x i64> %b.head, <4 x i64> poison, <4 x i32> zeroinitializer
   %x = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> %b.splat)
@@ -1856,11 +1856,11 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> shufflevector(<4 x i64> insertelement(<4 x i64> poison, i64 1, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer))
   ret <4 x i64> %x
 }
@@ -1889,11 +1889,11 @@ define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> shufflevector(<4 x i64> insertelement(<4 x i64> poison, i64 1, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer))
   ret <4 x i64> %x
 }
@@ -1914,11 +1914,11 @@ define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_v8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_v8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> %b)
   ret <8 x i64> %x
 }
@@ -1937,11 +1937,11 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_v8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_v8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <8 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <8 x i64> %b.head, <8 x i64> poison, <8 x i32> zeroinitializer
   %x = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> %b.splat)
@@ -1972,11 +1972,11 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_v8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_v8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> shufflevector(<8 x i64> insertelement(<8 x i64> poison, i64 1, i32 0), <8 x i64> poison, <8 x i32> zeroinitializer))
   ret <8 x i64> %x
 }
@@ -2005,11 +2005,11 @@ define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_v8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_v8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a, <8 x i64> %a, <8 x i64> shufflevector(<8 x i64> insertelement(<8 x i64> poison, i64 1, i32 0), <8 x i64> poison, <8 x i32> zeroinitializer))
   ret <8 x i64> %x
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index adcd676d3e058e..f716984599e3f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB,CHECK-ZVBB32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB,CHECK-ZVBB64
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB,CHECK-ZVKB32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB,CHECK-ZVKB64
 
 define <vscale x 1 x i8> @vandn_vv_nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %y) {
 ; CHECK-LABEL: vandn_vv_nxv1i8:
@@ -12,11 +12,11 @@ define <vscale x 1 x i8> @vandn_vv_nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i8> %x, %splat
@@ -32,11 +32,11 @@ define <vscale x 1 x i8> @vandn_vv_swapped_nxv1i8(<vscale x 1 x i8> %x, <vscale
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i8> %x, %splat
@@ -52,11 +52,11 @@ define <vscale x 1 x i8> @vandn_vx_nxv1i8(i8 %x, <vscale x 1 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 1 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
@@ -72,11 +72,11 @@ define <vscale x 1 x i8> @vandn_vx_swapped_nxv1i8(i8 %x, <vscale x 1 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 1 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
@@ -92,11 +92,11 @@ define <vscale x 2 x i8> @vandn_vv_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i8> %x, %splat
@@ -112,11 +112,11 @@ define <vscale x 2 x i8> @vandn_vv_swapped_nxv2i8(<vscale x 2 x i8> %x, <vscale
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i8> %x, %splat
@@ -132,11 +132,11 @@ define <vscale x 2 x i8> @vandn_vx_nxv2i8(i8 %x, <vscale x 2 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 2 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -152,11 +152,11 @@ define <vscale x 2 x i8> @vandn_vx_swapped_nxv2i8(i8 %x, <vscale x 2 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 2 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -172,11 +172,11 @@ define <vscale x 4 x i8> @vandn_vv_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i8> %x, %splat
@@ -192,11 +192,11 @@ define <vscale x 4 x i8> @vandn_vv_swapped_nxv4i8(<vscale x 4 x i8> %x, <vscale
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i8> %x, %splat
@@ -212,11 +212,11 @@ define <vscale x 4 x i8> @vandn_vx_nxv4i8(i8 %x, <vscale x 4 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 4 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -232,11 +232,11 @@ define <vscale x 4 x i8> @vandn_vx_swapped_nxv4i8(i8 %x, <vscale x 4 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 4 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -252,11 +252,11 @@ define <vscale x 8 x i8> @vandn_vv_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i8> %x, %splat
@@ -272,11 +272,11 @@ define <vscale x 8 x i8> @vandn_vv_swapped_nxv8i8(<vscale x 8 x i8> %x, <vscale
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i8> %x, %splat
@@ -292,11 +292,11 @@ define <vscale x 8 x i8> @vandn_vx_nxv8i8(i8 %x, <vscale x 8 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 8 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -312,11 +312,11 @@ define <vscale x 8 x i8> @vandn_vx_swapped_nxv8i8(i8 %x, <vscale x 8 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 8 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -332,11 +332,11 @@ define <vscale x 16 x i8> @vandn_vv_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
   %a = xor <vscale x 16 x i8> %x, %splat
@@ -352,11 +352,11 @@ define <vscale x 16 x i8> @vandn_vv_swapped_nxv16i8(<vscale x 16 x i8> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
   %a = xor <vscale x 16 x i8> %x, %splat
@@ -372,11 +372,11 @@ define <vscale x 16 x i8> @vandn_vx_nxv16i8(i8 %x, <vscale x 16 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 16 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
@@ -392,11 +392,11 @@ define <vscale x 16 x i8> @vandn_vx_swapped_nxv16i8(i8 %x, <vscale x 16 x i8> %y
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 16 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
@@ -412,11 +412,11 @@ define <vscale x 32 x i8> @vandn_vv_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
   %a = xor <vscale x 32 x i8> %x, %splat
@@ -432,11 +432,11 @@ define <vscale x 32 x i8> @vandn_vv_swapped_nxv32i8(<vscale x 32 x i8> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
   %a = xor <vscale x 32 x i8> %x, %splat
@@ -452,11 +452,11 @@ define <vscale x 32 x i8> @vandn_vx_nxv32i8(i8 %x, <vscale x 32 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 32 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
@@ -472,11 +472,11 @@ define <vscale x 32 x i8> @vandn_vx_swapped_nxv32i8(i8 %x, <vscale x 32 x i8> %y
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 32 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
@@ -492,11 +492,11 @@ define <vscale x 64 x i8> @vandn_vv_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
   %a = xor <vscale x 64 x i8> %x, %splat
@@ -512,11 +512,11 @@ define <vscale x 64 x i8> @vandn_vv_swapped_nxv64i8(<vscale x 64 x i8> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
   %a = xor <vscale x 64 x i8> %x, %splat
@@ -532,11 +532,11 @@ define <vscale x 64 x i8> @vandn_vx_nxv64i8(i8 %x, <vscale x 64 x i8> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 64 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
@@ -552,11 +552,11 @@ define <vscale x 64 x i8> @vandn_vx_swapped_nxv64i8(i8 %x, <vscale x 64 x i8> %y
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i8 %x, -1
   %head = insertelement <vscale x 64 x i8> poison, i8 %a, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
@@ -572,11 +572,11 @@ define <vscale x 1 x i16> @vandn_vv_nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i16> %x, %splat
@@ -592,11 +592,11 @@ define <vscale x 1 x i16> @vandn_vv_swapped_nxv1i16(<vscale x 1 x i16> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i16> %x, %splat
@@ -612,11 +612,11 @@ define <vscale x 1 x i16> @vandn_vx_nxv1i16(i16 %x, <vscale x 1 x i16> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 1 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
@@ -632,11 +632,11 @@ define <vscale x 1 x i16> @vandn_vx_swapped_nxv1i16(i16 %x, <vscale x 1 x i16> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 1 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
@@ -652,11 +652,11 @@ define <vscale x 2 x i16> @vandn_vv_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i16> %x, %splat
@@ -672,11 +672,11 @@ define <vscale x 2 x i16> @vandn_vv_swapped_nxv2i16(<vscale x 2 x i16> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i16> %x, %splat
@@ -692,11 +692,11 @@ define <vscale x 2 x i16> @vandn_vx_nxv2i16(i16 %x, <vscale x 2 x i16> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 2 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -712,11 +712,11 @@ define <vscale x 2 x i16> @vandn_vx_swapped_nxv2i16(i16 %x, <vscale x 2 x i16> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 2 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -732,11 +732,11 @@ define <vscale x 4 x i16> @vandn_vv_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i16> %x, %splat
@@ -752,11 +752,11 @@ define <vscale x 4 x i16> @vandn_vv_swapped_nxv4i16(<vscale x 4 x i16> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i16> %x, %splat
@@ -772,11 +772,11 @@ define <vscale x 4 x i16> @vandn_vx_nxv4i16(i16 %x, <vscale x 4 x i16> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 4 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -792,11 +792,11 @@ define <vscale x 4 x i16> @vandn_vx_swapped_nxv4i16(i16 %x, <vscale x 4 x i16> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 4 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -812,11 +812,11 @@ define <vscale x 8 x i16> @vandn_vv_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i16> %x, %splat
@@ -832,11 +832,11 @@ define <vscale x 8 x i16> @vandn_vv_swapped_nxv8i16(<vscale x 8 x i16> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i16> %x, %splat
@@ -852,11 +852,11 @@ define <vscale x 8 x i16> @vandn_vx_nxv8i16(i16 %x, <vscale x 8 x i16> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 8 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -872,11 +872,11 @@ define <vscale x 8 x i16> @vandn_vx_swapped_nxv8i16(i16 %x, <vscale x 8 x i16> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 8 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -892,11 +892,11 @@ define <vscale x 16 x i16> @vandn_vv_nxv16i16(<vscale x 16 x i16> %x, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
   %a = xor <vscale x 16 x i16> %x, %splat
@@ -912,11 +912,11 @@ define <vscale x 16 x i16> @vandn_vv_swapped_nxv16i16(<vscale x 16 x i16> %x, <v
 ; CHECK-NEXT:    vand.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
   %a = xor <vscale x 16 x i16> %x, %splat
@@ -932,11 +932,11 @@ define <vscale x 16 x i16> @vandn_vx_nxv16i16(i16 %x, <vscale x 16 x i16> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 16 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
@@ -952,11 +952,11 @@ define <vscale x 16 x i16> @vandn_vx_swapped_nxv16i16(i16 %x, <vscale x 16 x i16
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 16 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
@@ -972,11 +972,11 @@ define <vscale x 32 x i16> @vandn_vv_nxv32i16(<vscale x 32 x i16> %x, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
   %a = xor <vscale x 32 x i16> %x, %splat
@@ -992,11 +992,11 @@ define <vscale x 32 x i16> @vandn_vv_swapped_nxv32i16(<vscale x 32 x i16> %x, <v
 ; CHECK-NEXT:    vand.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
   %a = xor <vscale x 32 x i16> %x, %splat
@@ -1012,11 +1012,11 @@ define <vscale x 32 x i16> @vandn_vx_nxv32i16(i16 %x, <vscale x 32 x i16> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 32 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
@@ -1032,11 +1032,11 @@ define <vscale x 32 x i16> @vandn_vx_swapped_nxv32i16(i16 %x, <vscale x 32 x i16
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i16 %x, -1
   %head = insertelement <vscale x 32 x i16> poison, i16 %a, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
@@ -1052,11 +1052,11 @@ define <vscale x 1 x i32> @vandn_vv_nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i32> %x, %splat
@@ -1072,11 +1072,11 @@ define <vscale x 1 x i32> @vandn_vv_swapped_nxv1i32(<vscale x 1 x i32> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i32> %x, %splat
@@ -1092,11 +1092,11 @@ define <vscale x 1 x i32> @vandn_vx_nxv1i32(i32 %x, <vscale x 1 x i32> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 1 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
@@ -1112,11 +1112,11 @@ define <vscale x 1 x i32> @vandn_vx_swapped_nxv1i32(i32 %x, <vscale x 1 x i32> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 1 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
@@ -1132,11 +1132,11 @@ define <vscale x 2 x i32> @vandn_vv_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i32> %x, %splat
@@ -1152,11 +1152,11 @@ define <vscale x 2 x i32> @vandn_vv_swapped_nxv2i32(<vscale x 2 x i32> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i32> %x, %splat
@@ -1172,11 +1172,11 @@ define <vscale x 2 x i32> @vandn_vx_nxv2i32(i32 %x, <vscale x 2 x i32> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 2 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1192,11 +1192,11 @@ define <vscale x 2 x i32> @vandn_vx_swapped_nxv2i32(i32 %x, <vscale x 2 x i32> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 2 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1212,11 +1212,11 @@ define <vscale x 4 x i32> @vandn_vv_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i32> %x, %splat
@@ -1232,11 +1232,11 @@ define <vscale x 4 x i32> @vandn_vv_swapped_nxv4i32(<vscale x 4 x i32> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i32> %x, %splat
@@ -1252,11 +1252,11 @@ define <vscale x 4 x i32> @vandn_vx_nxv4i32(i32 %x, <vscale x 4 x i32> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -1272,11 +1272,11 @@ define <vscale x 4 x i32> @vandn_vx_swapped_nxv4i32(i32 %x, <vscale x 4 x i32> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -1292,11 +1292,11 @@ define <vscale x 8 x i32> @vandn_vv_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i32> %x, %splat
@@ -1312,11 +1312,11 @@ define <vscale x 8 x i32> @vandn_vv_swapped_nxv8i32(<vscale x 8 x i32> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i32> %x, %splat
@@ -1332,11 +1332,11 @@ define <vscale x 8 x i32> @vandn_vx_nxv8i32(i32 %x, <vscale x 8 x i32> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 8 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -1352,11 +1352,11 @@ define <vscale x 8 x i32> @vandn_vx_swapped_nxv8i32(i32 %x, <vscale x 8 x i32> %
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 8 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -1372,11 +1372,11 @@ define <vscale x 16 x i32> @vandn_vv_nxv16i32(<vscale x 16 x i32> %x, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
   %a = xor <vscale x 16 x i32> %x, %splat
@@ -1392,11 +1392,11 @@ define <vscale x 16 x i32> @vandn_vv_swapped_nxv16i32(<vscale x 16 x i32> %x, <v
 ; CHECK-NEXT:    vand.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
   %a = xor <vscale x 16 x i32> %x, %splat
@@ -1412,11 +1412,11 @@ define <vscale x 16 x i32> @vandn_vx_nxv16i32(i32 %x, <vscale x 16 x i32> %y) {
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 16 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -1432,11 +1432,11 @@ define <vscale x 16 x i32> @vandn_vx_swapped_nxv16i32(i32 %x, <vscale x 16 x i32
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_swapped_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_swapped_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %a = xor i32 %x, -1
   %head = insertelement <vscale x 16 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -1452,11 +1452,11 @@ define <vscale x 1 x i64> @vandn_vv_nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i64> %x, %splat
@@ -1472,11 +1472,11 @@ define <vscale x 1 x i64> @vandn_vv_swapped_nxv1i64(<vscale x 1 x i64> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %a = xor <vscale x 1 x i64> %x, %splat
@@ -1507,26 +1507,26 @@ define <vscale x 1 x i64> @vandn_vx_nxv1i64(i64 %x, <vscale x 1 x i64> %y) {
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_nxv1i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v9, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_nxv1i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_nxv1i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_nxv1i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 1 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1557,26 +1557,26 @@ define <vscale x 1 x i64> @vandn_vx_swapped_nxv1i64(i64 %x, <vscale x 1 x i64> %
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_swapped_nxv1i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v9, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_swapped_nxv1i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_swapped_nxv1i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_swapped_nxv1i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 1 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1592,11 +1592,11 @@ define <vscale x 2 x i64> @vandn_vv_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i64> %x, %splat
@@ -1612,11 +1612,11 @@ define <vscale x 2 x i64> @vandn_vv_swapped_nxv2i64(<vscale x 2 x i64> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
   %a = xor <vscale x 2 x i64> %x, %splat
@@ -1647,26 +1647,26 @@ define <vscale x 2 x i64> @vandn_vx_nxv2i64(i64 %x, <vscale x 2 x i64> %y) {
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_nxv2i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v10, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_nxv2i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_nxv2i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_nxv2i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 2 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -1697,26 +1697,26 @@ define <vscale x 2 x i64> @vandn_vx_swapped_nxv2i64(i64 %x, <vscale x 2 x i64> %
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_swapped_nxv2i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v10, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_swapped_nxv2i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_swapped_nxv2i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_swapped_nxv2i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 2 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -1732,11 +1732,11 @@ define <vscale x 4 x i64> @vandn_vv_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i64> %x, %splat
@@ -1752,11 +1752,11 @@ define <vscale x 4 x i64> @vandn_vv_swapped_nxv4i64(<vscale x 4 x i64> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %a = xor <vscale x 4 x i64> %x, %splat
@@ -1787,26 +1787,26 @@ define <vscale x 4 x i64> @vandn_vx_nxv4i64(i64 %x, <vscale x 4 x i64> %y) {
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_nxv4i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v12, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_nxv4i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_nxv4i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_nxv4i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 4 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -1837,26 +1837,26 @@ define <vscale x 4 x i64> @vandn_vx_swapped_nxv4i64(i64 %x, <vscale x 4 x i64> %
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_swapped_nxv4i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v12, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_swapped_nxv4i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_swapped_nxv4i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_swapped_nxv4i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 4 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -1872,11 +1872,11 @@ define <vscale x 8 x i64> @vandn_vv_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i64> %x, %splat
@@ -1892,11 +1892,11 @@ define <vscale x 8 x i64> @vandn_vv_swapped_nxv8i64(<vscale x 8 x i64> %x, <vsca
 ; CHECK-NEXT:    vand.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_swapped_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_swapped_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8
+; CHECK-ZVKB-NEXT:    ret
   %head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
   %a = xor <vscale x 8 x i64> %x, %splat
@@ -1927,26 +1927,26 @@ define <vscale x 8 x i64> @vandn_vx_nxv8i64(i64 %x, <vscale x 8 x i64> %y) {
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_nxv8i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v16, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_nxv8i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_nxv8i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v16, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_nxv8i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 8 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -1977,26 +1977,26 @@ define <vscale x 8 x i64> @vandn_vx_swapped_nxv8i64(i64 %x, <vscale x 8 x i64> %
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_swapped_nxv8i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v16, v8
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_swapped_nxv8i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_swapped_nxv8i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v16, v8
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_swapped_nxv8i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0
+; CHECK-ZVKB64-NEXT:    ret
   %a = xor i64 %x, -1
   %head = insertelement <vscale x 8 x i64> poison, i64 %a, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
index d0e23f236b2f7a..c1e85802d3732d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB,CHECK-ZVBB32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB,CHECK-ZVBB64
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB,CHECK-ZVKB32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB,CHECK-ZVKB64
 
 declare <vscale x 1 x i8> @llvm.vp.and.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
 declare <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
@@ -15,11 +15,11 @@ define <vscale x 1 x i8> @vandn_vv_vp_nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 -1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i8> @llvm.vp.and.nxv1i8(<vscale x 1 x i8> %not.a, <vscale x 1 x i8> %b, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i8> %x
@@ -33,11 +33,11 @@ define <vscale x 1 x i8> @vandn_vv_vp_swapped_nxv1i8(<vscale x 1 x i8> %a, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 -1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i8> @llvm.vp.and.nxv1i8(<vscale x 1 x i8> %b, <vscale x 1 x i8> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i8> %x
@@ -51,11 +51,11 @@ define <vscale x 1 x i8> @vandn_vx_vp_nxv1i8(i8 %a, <vscale x 1 x i8> %b, <vscal
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 1 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 1 x i8> %head.not.a, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
@@ -74,11 +74,11 @@ define <vscale x 2 x i8> @vandn_vv_vp_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i8> @llvm.vp.xor.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 -1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> %not.a, <vscale x 2 x i8> %b, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i8> %x
@@ -92,11 +92,11 @@ define <vscale x 2 x i8> @vandn_vv_vp_swapped_nxv2i8(<vscale x 2 x i8> %a, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i8> @llvm.vp.xor.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 -1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> %b, <vscale x 2 x i8> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i8> %x
@@ -110,11 +110,11 @@ define <vscale x 2 x i8> @vandn_vx_vp_nxv2i8(i8 %a, <vscale x 2 x i8> %b, <vscal
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 2 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 2 x i8> %head.not.a, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
@@ -133,11 +133,11 @@ define <vscale x 4 x i8> @vandn_vv_vp_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i8> @llvm.vp.xor.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 -1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> %not.a, <vscale x 4 x i8> %b, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i8> %x
@@ -151,11 +151,11 @@ define <vscale x 4 x i8> @vandn_vv_vp_swapped_nxv4i8(<vscale x 4 x i8> %a, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i8> @llvm.vp.xor.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 -1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> %b, <vscale x 4 x i8> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i8> %x
@@ -169,11 +169,11 @@ define <vscale x 4 x i8> @vandn_vx_vp_nxv4i8(i8 %a, <vscale x 4 x i8> %b, <vscal
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 4 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 4 x i8> %head.not.a, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -192,11 +192,11 @@ define <vscale x 8 x i8> @vandn_vv_vp_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i8> @llvm.vp.xor.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 -1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> %not.a, <vscale x 8 x i8> %b, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i8> %x
@@ -210,11 +210,11 @@ define <vscale x 8 x i8> @vandn_vv_vp_swapped_nxv8i8(<vscale x 8 x i8> %a, <vsca
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i8> @llvm.vp.xor.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 -1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> %b, <vscale x 8 x i8> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i8> %x
@@ -228,11 +228,11 @@ define <vscale x 8 x i8> @vandn_vx_vp_nxv8i8(i8 %a, <vscale x 8 x i8> %b, <vscal
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 8 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 8 x i8> %head.not.a, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -251,11 +251,11 @@ define <vscale x 16 x i8> @vandn_vv_vp_nxv16i8(<vscale x 16 x i8> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 -1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
   %x = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> %not.a, <vscale x 16 x i8> %b, <vscale x 16 x i1> %mask, i32 %evl)
   ret <vscale x 16 x i8> %x
@@ -269,11 +269,11 @@ define <vscale x 16 x i8> @vandn_vv_vp_swapped_nxv16i8(<vscale x 16 x i8> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 -1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
   %x = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i8> %not.a, <vscale x 16 x i1> %mask, i32 %evl)
   ret <vscale x 16 x i8> %x
@@ -287,11 +287,11 @@ define <vscale x 16 x i8> @vandn_vx_vp_nxv16i8(i8 %a, <vscale x 16 x i8> %b, <vs
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 16 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 16 x i8> %head.not.a, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
@@ -310,11 +310,11 @@ define <vscale x 32 x i8> @vandn_vv_vp_nxv32i8(<vscale x 32 x i8> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 32 x i8> @llvm.vp.xor.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 -1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
   %x = call <vscale x 32 x i8> @llvm.vp.and.nxv32i8(<vscale x 32 x i8> %not.a, <vscale x 32 x i8> %b, <vscale x 32 x i1> %mask, i32 %evl)
   ret <vscale x 32 x i8> %x
@@ -328,11 +328,11 @@ define <vscale x 32 x i8> @vandn_vv_vp_swapped_nxv32i8(<vscale x 32 x i8> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 32 x i8> @llvm.vp.xor.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 -1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
   %x = call <vscale x 32 x i8> @llvm.vp.and.nxv32i8(<vscale x 32 x i8> %b, <vscale x 32 x i8> %not.a, <vscale x 32 x i1> %mask, i32 %evl)
   ret <vscale x 32 x i8> %x
@@ -346,11 +346,11 @@ define <vscale x 32 x i8> @vandn_vx_vp_nxv32i8(i8 %a, <vscale x 32 x i8> %b, <vs
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 32 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 32 x i8> %head.not.a, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
@@ -369,11 +369,11 @@ define <vscale x 64 x i8> @vandn_vv_vp_nxv64i8(<vscale x 64 x i8> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 64 x i8> @llvm.vp.xor.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 -1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer), <vscale x 64 x i1> %mask, i32 %evl)
   %x = call <vscale x 64 x i8> @llvm.vp.and.nxv64i8(<vscale x 64 x i8> %not.a, <vscale x 64 x i8> %b, <vscale x 64 x i1> %mask, i32 %evl)
   ret <vscale x 64 x i8> %x
@@ -387,11 +387,11 @@ define <vscale x 64 x i8> @vandn_vv_vp_swapped_nxv64i8(<vscale x 64 x i8> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 64 x i8> @llvm.vp.xor.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 -1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer), <vscale x 64 x i1> %mask, i32 %evl)
   %x = call <vscale x 64 x i8> @llvm.vp.and.nxv64i8(<vscale x 64 x i8> %b, <vscale x 64 x i8> %not.a, <vscale x 64 x i1> %mask, i32 %evl)
   ret <vscale x 64 x i8> %x
@@ -405,11 +405,11 @@ define <vscale x 64 x i8> @vandn_vx_vp_nxv64i8(i8 %a, <vscale x 64 x i8> %b, <vs
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i8 %a, -1
   %head.not.a = insertelement <vscale x 64 x i8> poison, i8 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 64 x i8> %head.not.a, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
@@ -428,11 +428,11 @@ define <vscale x 1 x i16> @vandn_vv_vp_nxv1i16(<vscale x 1 x i16> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i16> @llvm.vp.xor.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 -1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i16> @llvm.vp.and.nxv1i16(<vscale x 1 x i16> %not.a, <vscale x 1 x i16> %b, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i16> %x
@@ -446,11 +446,11 @@ define <vscale x 1 x i16> @vandn_vv_vp_swapped_nxv1i16(<vscale x 1 x i16> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i16> @llvm.vp.xor.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 -1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i16> @llvm.vp.and.nxv1i16(<vscale x 1 x i16> %b, <vscale x 1 x i16> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i16> %x
@@ -464,11 +464,11 @@ define <vscale x 1 x i16> @vandn_vx_vp_nxv1i16(i16 %a, <vscale x 1 x i16> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i16 %a, -1
   %head.not.a = insertelement <vscale x 1 x i16> poison, i16 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 1 x i16> %head.not.a, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
@@ -487,11 +487,11 @@ define <vscale x 2 x i16> @vandn_vv_vp_nxv2i16(<vscale x 2 x i16> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i16> @llvm.vp.xor.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 -1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> %not.a, <vscale x 2 x i16> %b, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i16> %x
@@ -505,11 +505,11 @@ define <vscale x 2 x i16> @vandn_vv_vp_swapped_nxv2i16(<vscale x 2 x i16> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i16> @llvm.vp.xor.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 -1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i16> %x
@@ -523,11 +523,11 @@ define <vscale x 2 x i16> @vandn_vx_vp_nxv2i16(i16 %a, <vscale x 2 x i16> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i16 %a, -1
   %head.not.a = insertelement <vscale x 2 x i16> poison, i16 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 2 x i16> %head.not.a, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
@@ -546,11 +546,11 @@ define <vscale x 4 x i16> @vandn_vv_vp_nxv4i16(<vscale x 4 x i16> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i16> @llvm.vp.xor.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 -1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> %not.a, <vscale x 4 x i16> %b, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i16> %x
@@ -564,11 +564,11 @@ define <vscale x 4 x i16> @vandn_vv_vp_swapped_nxv4i16(<vscale x 4 x i16> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i16> @llvm.vp.xor.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 -1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> %b, <vscale x 4 x i16> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i16> %x
@@ -582,11 +582,11 @@ define <vscale x 4 x i16> @vandn_vx_vp_nxv4i16(i16 %a, <vscale x 4 x i16> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i16 %a, -1
   %head.not.a = insertelement <vscale x 4 x i16> poison, i16 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 4 x i16> %head.not.a, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -605,11 +605,11 @@ define <vscale x 8 x i16> @vandn_vv_vp_nxv8i16(<vscale x 8 x i16> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i16> @llvm.vp.xor.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 -1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> %not.a, <vscale x 8 x i16> %b, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i16> %x
@@ -623,11 +623,11 @@ define <vscale x 8 x i16> @vandn_vv_vp_swapped_nxv8i16(<vscale x 8 x i16> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i16> @llvm.vp.xor.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 -1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> %b, <vscale x 8 x i16> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i16> %x
@@ -641,11 +641,11 @@ define <vscale x 8 x i16> @vandn_vx_vp_nxv8i16(i16 %a, <vscale x 8 x i16> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i16 %a, -1
   %head.not.a = insertelement <vscale x 8 x i16> poison, i16 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 8 x i16> %head.not.a, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -664,11 +664,11 @@ define <vscale x 16 x i16> @vandn_vv_vp_nxv16i16(<vscale x 16 x i16> %a, <vscale
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 16 x i16> @llvm.vp.xor.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 -1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
   %x = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> %not.a, <vscale x 16 x i16> %b, <vscale x 16 x i1> %mask, i32 %evl)
   ret <vscale x 16 x i16> %x
@@ -682,11 +682,11 @@ define <vscale x 16 x i16> @vandn_vv_vp_swapped_nxv16i16(<vscale x 16 x i16> %a,
 ; CHECK-NEXT:    vand.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 16 x i16> @llvm.vp.xor.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 -1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
   %x = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> %b, <vscale x 16 x i16> %not.a, <vscale x 16 x i1> %mask, i32 %evl)
   ret <vscale x 16 x i16> %x
@@ -700,11 +700,11 @@ define <vscale x 16 x i16> @vandn_vx_vp_nxv16i16(i16 %a, <vscale x 16 x i16> %b,
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i16 %a, -1
   %head.not.a = insertelement <vscale x 16 x i16> poison, i16 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 16 x i16> %head.not.a, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
@@ -723,11 +723,11 @@ define <vscale x 32 x i16> @vandn_vv_vp_nxv32i16(<vscale x 32 x i16> %a, <vscale
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 32 x i16> @llvm.vp.xor.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 -1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
   %x = call <vscale x 32 x i16> @llvm.vp.and.nxv32i16(<vscale x 32 x i16> %not.a, <vscale x 32 x i16> %b, <vscale x 32 x i1> %mask, i32 %evl)
   ret <vscale x 32 x i16> %x
@@ -741,11 +741,11 @@ define <vscale x 32 x i16> @vandn_vv_vp_swapped_nxv32i16(<vscale x 32 x i16> %a,
 ; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 32 x i16> @llvm.vp.xor.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 -1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
   %x = call <vscale x 32 x i16> @llvm.vp.and.nxv32i16(<vscale x 32 x i16> %b, <vscale x 32 x i16> %not.a, <vscale x 32 x i1> %mask, i32 %evl)
   ret <vscale x 32 x i16> %x
@@ -759,11 +759,11 @@ define <vscale x 32 x i16> @vandn_vx_vp_nxv32i16(i16 %a, <vscale x 32 x i16> %b,
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i16 %a, -1
   %head.not.a = insertelement <vscale x 32 x i16> poison, i16 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 32 x i16> %head.not.a, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
@@ -782,11 +782,11 @@ define <vscale x 1 x i32> @vandn_vv_vp_nxv1i32(<vscale x 1 x i32> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i32> @llvm.vp.xor.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 -1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i32> @llvm.vp.and.nxv1i32(<vscale x 1 x i32> %not.a, <vscale x 1 x i32> %b, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i32> %x
@@ -800,11 +800,11 @@ define <vscale x 1 x i32> @vandn_vv_vp_swapped_nxv1i32(<vscale x 1 x i32> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i32> @llvm.vp.xor.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 -1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i32> @llvm.vp.and.nxv1i32(<vscale x 1 x i32> %b, <vscale x 1 x i32> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i32> %x
@@ -818,11 +818,11 @@ define <vscale x 1 x i32> @vandn_vx_vp_nxv1i32(i32 %a, <vscale x 1 x i32> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i32 %a, -1
   %head.not.a = insertelement <vscale x 1 x i32> poison, i32 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 1 x i32> %head.not.a, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
@@ -841,11 +841,11 @@ define <vscale x 2 x i32> @vandn_vv_vp_nxv2i32(<vscale x 2 x i32> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i32> @llvm.vp.xor.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 -1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> %not.a, <vscale x 2 x i32> %b, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i32> %x
@@ -859,11 +859,11 @@ define <vscale x 2 x i32> @vandn_vv_vp_swapped_nxv2i32(<vscale x 2 x i32> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i32> @llvm.vp.xor.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 -1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i32> %x
@@ -877,11 +877,11 @@ define <vscale x 2 x i32> @vandn_vx_vp_nxv2i32(i32 %a, <vscale x 2 x i32> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i32 %a, -1
   %head.not.a = insertelement <vscale x 2 x i32> poison, i32 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 2 x i32> %head.not.a, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -900,11 +900,11 @@ define <vscale x 4 x i32> @vandn_vv_vp_nxv4i32(<vscale x 4 x i32> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 -1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %not.a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i32> %x
@@ -918,11 +918,11 @@ define <vscale x 4 x i32> @vandn_vv_vp_swapped_nxv4i32(<vscale x 4 x i32> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 -1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i32> %x
@@ -936,11 +936,11 @@ define <vscale x 4 x i32> @vandn_vx_vp_nxv4i32(i32 %a, <vscale x 4 x i32> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i32 %a, -1
   %head.not.a = insertelement <vscale x 4 x i32> poison, i32 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 4 x i32> %head.not.a, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -959,11 +959,11 @@ define <vscale x 8 x i32> @vandn_vv_vp_nxv8i32(<vscale x 8 x i32> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i32> @llvm.vp.xor.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> %not.a, <vscale x 8 x i32> %b, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i32> %x
@@ -977,11 +977,11 @@ define <vscale x 8 x i32> @vandn_vv_vp_swapped_nxv8i32(<vscale x 8 x i32> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i32> @llvm.vp.xor.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i32> %x
@@ -995,11 +995,11 @@ define <vscale x 8 x i32> @vandn_vx_vp_nxv8i32(i32 %a, <vscale x 8 x i32> %b, <v
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i32 %a, -1
   %head.not.a = insertelement <vscale x 8 x i32> poison, i32 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 8 x i32> %head.not.a, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
@@ -1018,11 +1018,11 @@ define <vscale x 16 x i32> @vandn_vv_vp_nxv16i32(<vscale x 16 x i32> %a, <vscale
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 16 x i32> @llvm.vp.xor.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 -1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
   %x = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> %not.a, <vscale x 16 x i32> %b, <vscale x 16 x i1> %mask, i32 %evl)
   ret <vscale x 16 x i32> %x
@@ -1036,11 +1036,11 @@ define <vscale x 16 x i32> @vandn_vv_vp_swapped_nxv16i32(<vscale x 16 x i32> %a,
 ; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 16 x i32> @llvm.vp.xor.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 -1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
   %x = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> %b, <vscale x 16 x i32> %not.a, <vscale x 16 x i1> %mask, i32 %evl)
   ret <vscale x 16 x i32> %x
@@ -1054,11 +1054,11 @@ define <vscale x 16 x i32> @vandn_vx_vp_nxv16i32(i32 %a, <vscale x 16 x i32> %b,
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vx_vp_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vx_vp_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = xor i32 %a, -1
   %head.not.a = insertelement <vscale x 16 x i32> poison, i32 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 16 x i32> %head.not.a, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -1077,11 +1077,11 @@ define <vscale x 1 x i64> @vandn_vv_vp_nxv1i64(<vscale x 1 x i64> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 -1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %not.a, <vscale x 1 x i64> %b, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i64> %x
@@ -1095,11 +1095,11 @@ define <vscale x 1 x i64> @vandn_vv_vp_swapped_nxv1i64(<vscale x 1 x i64> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v9, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v9, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 -1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
   %x = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %b, <vscale x 1 x i64> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
   ret <vscale x 1 x i64> %x
@@ -1129,27 +1129,27 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_vp_nxv1i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v9, (a0), zero
-; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v8, v9, v0.t
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_vp_nxv1i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_vp_nxv1i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
+; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v9, v0.t
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_vp_nxv1i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB64-NEXT:    ret
   %not.a = xor i64 %a, -1
   %head.not.a = insertelement <vscale x 1 x i64> poison, i64 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 1 x i64> %head.not.a, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1168,11 +1168,11 @@ define <vscale x 2 x i64> @vandn_vv_vp_nxv2i64(<vscale x 2 x i64> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i64> @llvm.vp.xor.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 -1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> %not.a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i64> %x
@@ -1186,11 +1186,11 @@ define <vscale x 2 x i64> @vandn_vv_vp_swapped_nxv2i64(<vscale x 2 x i64> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v10, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v10, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 2 x i64> @llvm.vp.xor.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 -1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
   %x = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> %b, <vscale x 2 x i64> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
   ret <vscale x 2 x i64> %x
@@ -1220,27 +1220,27 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_vp_nxv2i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v10, (a0), zero
-; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v8, v10, v0.t
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_vp_nxv2i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_vp_nxv2i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
+; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v10, v0.t
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_vp_nxv2i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB64-NEXT:    ret
   %not.a = xor i64 %a, -1
   %head.not.a = insertelement <vscale x 2 x i64> poison, i64 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 2 x i64> %head.not.a, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -1259,11 +1259,11 @@ define <vscale x 4 x i64> @vandn_vv_vp_nxv4i64(<vscale x 4 x i64> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i64> @llvm.vp.xor.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 -1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> %not.a, <vscale x 4 x i64> %b, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i64> %x
@@ -1277,11 +1277,11 @@ define <vscale x 4 x i64> @vandn_vv_vp_swapped_nxv4i64(<vscale x 4 x i64> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v12, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v12, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 4 x i64> @llvm.vp.xor.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 -1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
   %x = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> %b, <vscale x 4 x i64> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
   ret <vscale x 4 x i64> %x
@@ -1311,27 +1311,27 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_vp_nxv4i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v12, (a0), zero
-; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v8, v12, v0.t
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_vp_nxv4i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_vp_nxv4i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
+; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v12, v0.t
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_vp_nxv4i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB64-NEXT:    ret
   %not.a = xor i64 %a, -1
   %head.not.a = insertelement <vscale x 4 x i64> poison, i64 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 4 x i64> %head.not.a, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -1350,11 +1350,11 @@ define <vscale x 8 x i64> @vandn_vv_vp_nxv8i64(<vscale x 8 x i64> %a, <vscale x
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i64> @llvm.vp.xor.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 -1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> %not.a, <vscale x 8 x i64> %b, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i64> %x
@@ -1368,11 +1368,11 @@ define <vscale x 8 x i64> @vandn_vv_vp_swapped_nxv8i64(<vscale x 8 x i64> %a, <v
 ; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vandn_vv_vp_swapped_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vandn.vv v8, v16, v8, v0.t
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vandn_vv_vp_swapped_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vandn.vv v8, v16, v8, v0.t
+; CHECK-ZVKB-NEXT:    ret
   %not.a = call <vscale x 8 x i64> @llvm.vp.xor.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 -1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
   %x = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> %b, <vscale x 8 x i64> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
   ret <vscale x 8 x i64> %x
@@ -1402,27 +1402,27 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB32-LABEL: vandn_vx_vp_nxv8i64:
-; CHECK-ZVBB32:       # %bb.0:
-; CHECK-ZVBB32-NEXT:    addi sp, sp, -16
-; CHECK-ZVBB32-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-ZVBB32-NEXT:    not a0, a0
-; CHECK-ZVBB32-NEXT:    not a1, a1
-; CHECK-ZVBB32-NEXT:    sw a1, 12(sp)
-; CHECK-ZVBB32-NEXT:    sw a0, 8(sp)
-; CHECK-ZVBB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVBB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB32-NEXT:    vlse64.v v16, (a0), zero
-; CHECK-ZVBB32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-ZVBB32-NEXT:    vand.vv v8, v8, v16, v0.t
-; CHECK-ZVBB32-NEXT:    addi sp, sp, 16
-; CHECK-ZVBB32-NEXT:    ret
-;
-; CHECK-ZVBB64-LABEL: vandn_vx_vp_nxv8i64:
-; CHECK-ZVBB64:       # %bb.0:
-; CHECK-ZVBB64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-ZVBB64-NEXT:    vandn.vx v8, v8, a0, v0.t
-; CHECK-ZVBB64-NEXT:    ret
+; CHECK-ZVKB32-LABEL: vandn_vx_vp_nxv8i64:
+; CHECK-ZVKB32:       # %bb.0:
+; CHECK-ZVKB32-NEXT:    addi sp, sp, -16
+; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ZVKB32-NEXT:    not a0, a0
+; CHECK-ZVKB32-NEXT:    not a1, a1
+; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
+; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
+; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
+; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
+; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
+; CHECK-ZVKB32-NEXT:    ret
+;
+; CHECK-ZVKB64-LABEL: vandn_vx_vp_nxv8i64:
+; CHECK-ZVKB64:       # %bb.0:
+; CHECK-ZVKB64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-ZVKB64-NEXT:    vandn.vx v8, v8, a0, v0.t
+; CHECK-ZVKB64-NEXT:    ret
   %not.a = xor i64 %a, -1
   %head.not.a = insertelement <vscale x 8 x i64> poison, i64 %not.a, i32 0
   %splat.not.a = shufflevector <vscale x 8 x i64> %head.not.a, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
index 394435f55cf20f..2d8b3647163ea0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVKB
 
 declare <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i8>)
 
@@ -18,11 +18,11 @@ define <vscale x 1 x i8> @vrol_vv_nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> %b)
   ret <vscale x 1 x i8> %x
 }
@@ -40,11 +40,11 @@ define <vscale x 1 x i8> @vrol_vx_nxv1i8(<vscale x 1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i8> %b.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> %b.splat)
@@ -65,11 +65,11 @@ define <vscale x 2 x i8> @vrol_vv_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i8> @llvm.fshl.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> %b)
   ret <vscale x 2 x i8> %x
 }
@@ -87,11 +87,11 @@ define <vscale x 2 x i8> @vrol_vx_nxv2i8(<vscale x 2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i8> %b.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i8> @llvm.fshl.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> %b.splat)
@@ -112,11 +112,11 @@ define <vscale x 4 x i8> @vrol_vv_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i8> @llvm.fshl.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> %b)
   ret <vscale x 4 x i8> %x
 }
@@ -134,11 +134,11 @@ define <vscale x 4 x i8> @vrol_vx_nxv4i8(<vscale x 4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i8> %b.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i8> @llvm.fshl.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> %b.splat)
@@ -159,11 +159,11 @@ define <vscale x 8 x i8> @vrol_vv_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i8> @llvm.fshl.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b)
   ret <vscale x 8 x i8> %x
 }
@@ -181,11 +181,11 @@ define <vscale x 8 x i8> @vrol_vx_nxv8i8(<vscale x 8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i8> %b.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i8> @llvm.fshl.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b.splat)
@@ -206,11 +206,11 @@ define <vscale x 16 x i8> @vrol_vv_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 16 x i8> %x
 }
@@ -228,11 +228,11 @@ define <vscale x 16 x i8> @vrol_vx_nxv16i8(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 16 x i8> %b.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
   %x = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b.splat)
@@ -253,11 +253,11 @@ define <vscale x 32 x i8> @vrol_vv_nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i8> @llvm.fshl.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> %b)
   ret <vscale x 32 x i8> %x
 }
@@ -275,11 +275,11 @@ define <vscale x 32 x i8> @vrol_vx_nxv32i8(<vscale x 32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 32 x i8> %b.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
   %x = call <vscale x 32 x i8> @llvm.fshl.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> %b.splat)
@@ -300,11 +300,11 @@ define <vscale x 64 x i8> @vrol_vv_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 64 x i8> @llvm.fshl.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
   ret <vscale x 64 x i8> %x
 }
@@ -322,11 +322,11 @@ define <vscale x 64 x i8> @vrol_vx_nxv64i8(<vscale x 64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 64 x i8> %b.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
   %x = call <vscale x 64 x i8> @llvm.fshl.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> %b.splat)
@@ -347,11 +347,11 @@ define <vscale x 1 x i16> @vrol_vv_nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i16> @llvm.fshl.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> %b)
   ret <vscale x 1 x i16> %x
 }
@@ -369,11 +369,11 @@ define <vscale x 1 x i16> @vrol_vx_nxv1i16(<vscale x 1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i16> %b.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i16> @llvm.fshl.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> %b.splat)
@@ -394,11 +394,11 @@ define <vscale x 2 x i16> @vrol_vv_nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i16> @llvm.fshl.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> %b)
   ret <vscale x 2 x i16> %x
 }
@@ -416,11 +416,11 @@ define <vscale x 2 x i16> @vrol_vx_nxv2i16(<vscale x 2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i16> %b.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i16> @llvm.fshl.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> %b.splat)
@@ -441,11 +441,11 @@ define <vscale x 4 x i16> @vrol_vv_nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i16> @llvm.fshl.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b)
   ret <vscale x 4 x i16> %x
 }
@@ -463,11 +463,11 @@ define <vscale x 4 x i16> @vrol_vx_nxv4i16(<vscale x 4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i16> %b.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i16> @llvm.fshl.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b.splat)
@@ -488,11 +488,11 @@ define <vscale x 8 x i16> @vrol_vv_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 8 x i16> %x
 }
@@ -510,11 +510,11 @@ define <vscale x 8 x i16> @vrol_vx_nxv8i16(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i16> %b.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b.splat)
@@ -535,11 +535,11 @@ define <vscale x 16 x i16> @vrol_vv_nxv16i16(<vscale x 16 x i16> %a, <vscale x 1
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i16> @llvm.fshl.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> %b)
   ret <vscale x 16 x i16> %x
 }
@@ -557,11 +557,11 @@ define <vscale x 16 x i16> @vrol_vx_nxv16i16(<vscale x 16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 16 x i16> %b.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
   %x = call <vscale x 16 x i16> @llvm.fshl.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> %b.splat)
@@ -582,11 +582,11 @@ define <vscale x 32 x i16> @vrol_vv_nxv32i16(<vscale x 32 x i16> %a, <vscale x 3
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i16> @llvm.fshl.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
   ret <vscale x 32 x i16> %x
 }
@@ -604,11 +604,11 @@ define <vscale x 32 x i16> @vrol_vx_nxv32i16(<vscale x 32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 32 x i16> %b.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
   %x = call <vscale x 32 x i16> @llvm.fshl.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> %b.splat)
@@ -630,11 +630,11 @@ define <vscale x 1 x i32> @vrol_vv_nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> %b)
   ret <vscale x 1 x i32> %x
 }
@@ -664,11 +664,11 @@ define <vscale x 1 x i32> @vrol_vx_nxv1i32(<vscale x 1 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i32> %b.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> %b.splat)
@@ -690,11 +690,11 @@ define <vscale x 2 x i32> @vrol_vv_nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i32> @llvm.fshl.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> %b)
   ret <vscale x 2 x i32> %x
 }
@@ -724,11 +724,11 @@ define <vscale x 2 x i32> @vrol_vx_nxv2i32(<vscale x 2 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i32> %b.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i32> @llvm.fshl.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> %b.splat)
@@ -750,11 +750,11 @@ define <vscale x 4 x i32> @vrol_vv_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %x
 }
@@ -784,11 +784,11 @@ define <vscale x 4 x i32> @vrol_vx_nxv4i32(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b.splat)
@@ -810,11 +810,11 @@ define <vscale x 8 x i32> @vrol_vv_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i32> @llvm.fshl.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b)
   ret <vscale x 8 x i32> %x
 }
@@ -844,11 +844,11 @@ define <vscale x 8 x i32> @vrol_vx_nxv8i32(<vscale x 8 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i32> %b.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i32> @llvm.fshl.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b.splat)
@@ -870,11 +870,11 @@ define <vscale x 16 x i32> @vrol_vv_nxv16i32(<vscale x 16 x i32> %a, <vscale x 1
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i32> @llvm.fshl.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
   ret <vscale x 16 x i32> %x
 }
@@ -904,11 +904,11 @@ define <vscale x 16 x i32> @vrol_vx_nxv16i32(<vscale x 16 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v24, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 16 x i32> %b.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
   %x = call <vscale x 16 x i32> @llvm.fshl.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> %b.splat)
@@ -930,11 +930,11 @@ define <vscale x 1 x i64> @vrol_vv_nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i64> @llvm.fshl.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> %b)
   ret <vscale x 1 x i64> %x
 }
@@ -964,11 +964,11 @@ define <vscale x 1 x i64> @vrol_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v9, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i64> %b.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i64> @llvm.fshl.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> %b.splat)
@@ -990,11 +990,11 @@ define <vscale x 2 x i64> @vrol_vv_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %x
 }
@@ -1024,11 +1024,11 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i64> %b.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b.splat)
@@ -1050,11 +1050,11 @@ define <vscale x 4 x i64> @vrol_vv_nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
   ret <vscale x 4 x i64> %x
 }
@@ -1084,11 +1084,11 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i64> %b.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b.splat)
@@ -1110,11 +1110,11 @@ define <vscale x 8 x i64> @vrol_vv_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vv_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vv_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   ret <vscale x 8 x i64> %x
 }
@@ -1144,11 +1144,11 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vrol_vx_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vrol.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vrol_vx_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vrol.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i64> %b.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> %b.splat)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
index 0f13efde7e12a1..f8c1bb5194f11b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVBB
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVBB
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVKB
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvkb -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-ZVKB
 
 declare <vscale x 1 x i8> @llvm.fshr.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i8>)
 declare <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i8>)
@@ -19,11 +19,11 @@ define <vscale x 1 x i8> @vror_vv_nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i8> @llvm.fshr.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> %b)
   ret <vscale x 1 x i8> %x
 }
@@ -41,11 +41,11 @@ define <vscale x 1 x i8> @vror_vx_nxv1i8(<vscale x 1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i8> %b.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i8> @llvm.fshr.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> %b.splat)
@@ -61,11 +61,11 @@ define <vscale x 1 x i8> @vror_vi_nxv1i8(<vscale x 1 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i8> @llvm.fshr.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i8> %x
 }
@@ -79,11 +79,11 @@ define <vscale x 1 x i8> @vror_vi_rotl_nxv1i8(<vscale x 1 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv1i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv1i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i8> %x
 }
@@ -103,11 +103,11 @@ define <vscale x 2 x i8> @vror_vv_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i8> @llvm.fshr.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> %b)
   ret <vscale x 2 x i8> %x
 }
@@ -125,11 +125,11 @@ define <vscale x 2 x i8> @vror_vx_nxv2i8(<vscale x 2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i8> %b.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i8> @llvm.fshr.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> %b.splat)
@@ -145,11 +145,11 @@ define <vscale x 2 x i8> @vror_vi_nxv2i8(<vscale x 2 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i8> @llvm.fshr.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i8> %x
 }
@@ -163,11 +163,11 @@ define <vscale x 2 x i8> @vror_vi_rotl_nxv2i8(<vscale x 2 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv2i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv2i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i8> @llvm.fshl.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i8> %x
 }
@@ -187,11 +187,11 @@ define <vscale x 4 x i8> @vror_vv_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i8> @llvm.fshr.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> %b)
   ret <vscale x 4 x i8> %x
 }
@@ -209,11 +209,11 @@ define <vscale x 4 x i8> @vror_vx_nxv4i8(<vscale x 4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i8> %b.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i8> @llvm.fshr.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> %b.splat)
@@ -229,11 +229,11 @@ define <vscale x 4 x i8> @vror_vi_nxv4i8(<vscale x 4 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i8> @llvm.fshr.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i8> %x
 }
@@ -247,11 +247,11 @@ define <vscale x 4 x i8> @vror_vi_rotl_nxv4i8(<vscale x 4 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv4i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv4i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i8> @llvm.fshl.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i8> %x
 }
@@ -271,11 +271,11 @@ define <vscale x 8 x i8> @vror_vv_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8>
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i8> @llvm.fshr.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b)
   ret <vscale x 8 x i8> %x
 }
@@ -293,11 +293,11 @@ define <vscale x 8 x i8> @vror_vx_nxv8i8(<vscale x 8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i8> %b.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i8> @llvm.fshr.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b.splat)
@@ -313,11 +313,11 @@ define <vscale x 8 x i8> @vror_vi_nxv8i8(<vscale x 8 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i8> @llvm.fshr.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i8> %x
 }
@@ -331,11 +331,11 @@ define <vscale x 8 x i8> @vror_vi_rotl_nxv8i8(<vscale x 8 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv8i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv8i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i8> @llvm.fshl.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i8> %x
 }
@@ -355,11 +355,11 @@ define <vscale x 16 x i8> @vror_vv_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 16 x i8> %x
 }
@@ -377,11 +377,11 @@ define <vscale x 16 x i8> @vror_vx_nxv16i8(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 16 x i8> %b.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
   %x = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b.splat)
@@ -397,11 +397,11 @@ define <vscale x 16 x i8> @vror_vi_nxv16i8(<vscale x 16 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
   ret <vscale x 16 x i8> %x
 }
@@ -415,11 +415,11 @@ define <vscale x 16 x i8> @vror_vi_rotl_nxv16i8(<vscale x 16 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv16i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv16i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
   ret <vscale x 16 x i8> %x
 }
@@ -439,11 +439,11 @@ define <vscale x 32 x i8> @vror_vv_nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i8> @llvm.fshr.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> %b)
   ret <vscale x 32 x i8> %x
 }
@@ -461,11 +461,11 @@ define <vscale x 32 x i8> @vror_vx_nxv32i8(<vscale x 32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 32 x i8> %b.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
   %x = call <vscale x 32 x i8> @llvm.fshr.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> %b.splat)
@@ -481,11 +481,11 @@ define <vscale x 32 x i8> @vror_vi_nxv32i8(<vscale x 32 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i8> @llvm.fshr.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer))
   ret <vscale x 32 x i8> %x
 }
@@ -499,11 +499,11 @@ define <vscale x 32 x i8> @vror_vi_rotl_nxv32i8(<vscale x 32 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv32i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv32i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i8> @llvm.fshl.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer))
   ret <vscale x 32 x i8> %x
 }
@@ -523,11 +523,11 @@ define <vscale x 64 x i8> @vror_vv_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 64 x i8> @llvm.fshr.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
   ret <vscale x 64 x i8> %x
 }
@@ -545,11 +545,11 @@ define <vscale x 64 x i8> @vror_vx_nxv64i8(<vscale x 64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
   %b.splat = shufflevector <vscale x 64 x i8> %b.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
   %x = call <vscale x 64 x i8> @llvm.fshr.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> %b.splat)
@@ -565,11 +565,11 @@ define <vscale x 64 x i8> @vror_vi_nxv64i8(<vscale x 64 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 64 x i8> @llvm.fshr.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer))
   ret <vscale x 64 x i8> %x
 }
@@ -583,11 +583,11 @@ define <vscale x 64 x i8> @vror_vi_rotl_nxv64i8(<vscale x 64 x i8> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv64i8:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 7
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv64i8:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 7
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 64 x i8> @llvm.fshl.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer))
   ret <vscale x 64 x i8> %x
 }
@@ -607,11 +607,11 @@ define <vscale x 1 x i16> @vror_vv_nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i16> @llvm.fshr.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> %b)
   ret <vscale x 1 x i16> %x
 }
@@ -629,11 +629,11 @@ define <vscale x 1 x i16> @vror_vx_nxv1i16(<vscale x 1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i16> %b.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i16> @llvm.fshr.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> %b.splat)
@@ -649,11 +649,11 @@ define <vscale x 1 x i16> @vror_vi_nxv1i16(<vscale x 1 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i16> @llvm.fshr.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i16> %x
 }
@@ -667,11 +667,11 @@ define <vscale x 1 x i16> @vror_vi_rotl_nxv1i16(<vscale x 1 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv1i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv1i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i16> @llvm.fshl.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i16> %x
 }
@@ -691,11 +691,11 @@ define <vscale x 2 x i16> @vror_vv_nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i16> @llvm.fshr.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> %b)
   ret <vscale x 2 x i16> %x
 }
@@ -713,11 +713,11 @@ define <vscale x 2 x i16> @vror_vx_nxv2i16(<vscale x 2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i16> %b.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i16> @llvm.fshr.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> %b.splat)
@@ -733,11 +733,11 @@ define <vscale x 2 x i16> @vror_vi_nxv2i16(<vscale x 2 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i16> @llvm.fshr.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i16> %x
 }
@@ -751,11 +751,11 @@ define <vscale x 2 x i16> @vror_vi_rotl_nxv2i16(<vscale x 2 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv2i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv2i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i16> @llvm.fshl.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i16> %x
 }
@@ -775,11 +775,11 @@ define <vscale x 4 x i16> @vror_vv_nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i16> @llvm.fshr.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b)
   ret <vscale x 4 x i16> %x
 }
@@ -797,11 +797,11 @@ define <vscale x 4 x i16> @vror_vx_nxv4i16(<vscale x 4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i16> %b.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i16> @llvm.fshr.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b.splat)
@@ -817,11 +817,11 @@ define <vscale x 4 x i16> @vror_vi_nxv4i16(<vscale x 4 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i16> @llvm.fshr.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i16> %x
 }
@@ -835,11 +835,11 @@ define <vscale x 4 x i16> @vror_vi_rotl_nxv4i16(<vscale x 4 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv4i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv4i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i16> @llvm.fshl.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i16> %x
 }
@@ -859,11 +859,11 @@ define <vscale x 8 x i16> @vror_vv_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 8 x i16> %x
 }
@@ -881,11 +881,11 @@ define <vscale x 8 x i16> @vror_vx_nxv8i16(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i16> %b.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b.splat)
@@ -901,11 +901,11 @@ define <vscale x 8 x i16> @vror_vi_nxv8i16(<vscale x 8 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i16> %x
 }
@@ -919,11 +919,11 @@ define <vscale x 8 x i16> @vror_vi_rotl_nxv8i16(<vscale x 8 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv8i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv8i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i16> %x
 }
@@ -943,11 +943,11 @@ define <vscale x 16 x i16> @vror_vv_nxv16i16(<vscale x 16 x i16> %a, <vscale x 1
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i16> @llvm.fshr.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> %b)
   ret <vscale x 16 x i16> %x
 }
@@ -965,11 +965,11 @@ define <vscale x 16 x i16> @vror_vx_nxv16i16(<vscale x 16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 16 x i16> %b.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
   %x = call <vscale x 16 x i16> @llvm.fshr.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> %b.splat)
@@ -985,11 +985,11 @@ define <vscale x 16 x i16> @vror_vi_nxv16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i16> @llvm.fshr.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer))
   ret <vscale x 16 x i16> %x
 }
@@ -1003,11 +1003,11 @@ define <vscale x 16 x i16> @vror_vi_rotl_nxv16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv16i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv16i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i16> @llvm.fshl.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer))
   ret <vscale x 16 x i16> %x
 }
@@ -1027,11 +1027,11 @@ define <vscale x 32 x i16> @vror_vv_nxv32i16(<vscale x 32 x i16> %a, <vscale x 3
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i16> @llvm.fshr.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
   ret <vscale x 32 x i16> %x
 }
@@ -1049,11 +1049,11 @@ define <vscale x 32 x i16> @vror_vx_nxv32i16(<vscale x 32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
   %b.splat = shufflevector <vscale x 32 x i16> %b.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
   %x = call <vscale x 32 x i16> @llvm.fshr.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> %b.splat)
@@ -1069,11 +1069,11 @@ define <vscale x 32 x i16> @vror_vi_nxv32i16(<vscale x 32 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i16> @llvm.fshr.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer))
   ret <vscale x 32 x i16> %x
 }
@@ -1087,11 +1087,11 @@ define <vscale x 32 x i16> @vror_vi_rotl_nxv32i16(<vscale x 32 x i16> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv32i16:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 15
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv32i16:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 15
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 32 x i16> @llvm.fshl.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer))
   ret <vscale x 32 x i16> %x
 }
@@ -1112,11 +1112,11 @@ define <vscale x 1 x i32> @vror_vv_nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> %b)
   ret <vscale x 1 x i32> %x
 }
@@ -1146,11 +1146,11 @@ define <vscale x 1 x i32> @vror_vx_nxv1i32(<vscale x 1 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i32> %b.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> %b.splat)
@@ -1166,11 +1166,11 @@ define <vscale x 1 x i32> @vror_vi_nxv1i32(<vscale x 1 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i32> %x
 }
@@ -1184,11 +1184,11 @@ define <vscale x 1 x i32> @vror_vi_rotl_nxv1i32(<vscale x 1 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv1i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv1i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i32> %x
 }
@@ -1209,11 +1209,11 @@ define <vscale x 2 x i32> @vror_vv_nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i32> @llvm.fshr.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> %b)
   ret <vscale x 2 x i32> %x
 }
@@ -1243,11 +1243,11 @@ define <vscale x 2 x i32> @vror_vx_nxv2i32(<vscale x 2 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i32> %b.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i32> @llvm.fshr.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> %b.splat)
@@ -1263,11 +1263,11 @@ define <vscale x 2 x i32> @vror_vi_nxv2i32(<vscale x 2 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i32> @llvm.fshr.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i32> %x
 }
@@ -1281,11 +1281,11 @@ define <vscale x 2 x i32> @vror_vi_rotl_nxv2i32(<vscale x 2 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv2i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv2i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i32> @llvm.fshl.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i32> %x
 }
@@ -1306,11 +1306,11 @@ define <vscale x 4 x i32> @vror_vv_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %x
 }
@@ -1340,11 +1340,11 @@ define <vscale x 4 x i32> @vror_vx_nxv4i32(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b.splat)
@@ -1360,11 +1360,11 @@ define <vscale x 4 x i32> @vror_vi_nxv4i32(<vscale x 4 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i32> %x
 }
@@ -1378,11 +1378,11 @@ define <vscale x 4 x i32> @vror_vi_rotl_nxv4i32(<vscale x 4 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv4i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv4i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i32> %x
 }
@@ -1403,11 +1403,11 @@ define <vscale x 8 x i32> @vror_vv_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i32> @llvm.fshr.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b)
   ret <vscale x 8 x i32> %x
 }
@@ -1437,11 +1437,11 @@ define <vscale x 8 x i32> @vror_vx_nxv8i32(<vscale x 8 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i32> %b.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i32> @llvm.fshr.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b.splat)
@@ -1457,11 +1457,11 @@ define <vscale x 8 x i32> @vror_vi_nxv8i32(<vscale x 8 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i32> @llvm.fshr.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i32> %x
 }
@@ -1475,11 +1475,11 @@ define <vscale x 8 x i32> @vror_vi_rotl_nxv8i32(<vscale x 8 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv8i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv8i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i32> @llvm.fshl.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i32> %x
 }
@@ -1500,11 +1500,11 @@ define <vscale x 16 x i32> @vror_vv_nxv16i32(<vscale x 16 x i32> %a, <vscale x 1
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i32> @llvm.fshr.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
   ret <vscale x 16 x i32> %x
 }
@@ -1534,11 +1534,11 @@ define <vscale x 16 x i32> @vror_vx_nxv16i32(<vscale x 16 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v24, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 16 x i32> %b.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
   %x = call <vscale x 16 x i32> @llvm.fshr.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> %b.splat)
@@ -1554,11 +1554,11 @@ define <vscale x 16 x i32> @vror_vi_nxv16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i32> @llvm.fshr.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer))
   ret <vscale x 16 x i32> %x
 }
@@ -1572,11 +1572,11 @@ define <vscale x 16 x i32> @vror_vi_rotl_nxv16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv16i32:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 31
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv16i32:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 31
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 16 x i32> @llvm.fshl.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer))
   ret <vscale x 16 x i32> %x
 }
@@ -1597,11 +1597,11 @@ define <vscale x 1 x i64> @vror_vv_nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v9
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v9
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i64> @llvm.fshr.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> %b)
   ret <vscale x 1 x i64> %x
 }
@@ -1631,11 +1631,11 @@ define <vscale x 1 x i64> @vror_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v9, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 1 x i64> %b.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
   %x = call <vscale x 1 x i64> @llvm.fshr.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> %b.splat)
@@ -1652,11 +1652,11 @@ define <vscale x 1 x i64> @vror_vi_nxv1i64(<vscale x 1 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i64> @llvm.fshr.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i64> %x
 }
@@ -1671,11 +1671,11 @@ define <vscale x 1 x i64> @vror_vi_rotl_nxv1i64(<vscale x 1 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv1i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv1i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 1 x i64> @llvm.fshl.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer))
   ret <vscale x 1 x i64> %x
 }
@@ -1696,11 +1696,11 @@ define <vscale x 2 x i64> @vror_vv_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v10
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v10
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %x
 }
@@ -1730,11 +1730,11 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 2 x i64> %b.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
   %x = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b.splat)
@@ -1751,11 +1751,11 @@ define <vscale x 2 x i64> @vror_vi_nxv2i64(<vscale x 2 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i64> %x
 }
@@ -1770,11 +1770,11 @@ define <vscale x 2 x i64> @vror_vi_rotl_nxv2i64(<vscale x 2 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv2i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv2i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
   ret <vscale x 2 x i64> %x
 }
@@ -1795,11 +1795,11 @@ define <vscale x 4 x i64> @vror_vv_nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v12
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v12
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i64> @llvm.fshr.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
   ret <vscale x 4 x i64> %x
 }
@@ -1829,11 +1829,11 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i64> %b.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
   %x = call <vscale x 4 x i64> @llvm.fshr.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b.splat)
@@ -1850,11 +1850,11 @@ define <vscale x 4 x i64> @vror_vi_nxv4i64(<vscale x 4 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i64> @llvm.fshr.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i64> %x
 }
@@ -1869,11 +1869,11 @@ define <vscale x 4 x i64> @vror_vi_rotl_nxv4i64(<vscale x 4 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv4i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv4i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer))
   ret <vscale x 4 x i64> %x
 }
@@ -1894,11 +1894,11 @@ define <vscale x 8 x i64> @vror_vv_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vv_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vv v8, v8, v16
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vv_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vv v8, v8, v16
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i64> @llvm.fshr.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   ret <vscale x 8 x i64> %x
 }
@@ -1928,11 +1928,11 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
 ; CHECK-RV64-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vx_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vx v8, v8, a0
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vx_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vx v8, v8, a0
+; CHECK-ZVKB-NEXT:    ret
   %b.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
   %b.splat = shufflevector <vscale x 8 x i64> %b.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
   %x = call <vscale x 8 x i64> @llvm.fshr.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> %b.splat)
@@ -1949,11 +1949,11 @@ define <vscale x 8 x i64> @vror_vi_nxv8i64(<vscale x 8 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 1
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 1
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i64> @llvm.fshr.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i64> %x
 }
@@ -1968,11 +1968,11 @@ define <vscale x 8 x i64> @vror_vi_rotl_nxv8i64(<vscale x 8 x i64> %a) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 ;
-; CHECK-ZVBB-LABEL: vror_vi_rotl_nxv8i64:
-; CHECK-ZVBB:       # %bb.0:
-; CHECK-ZVBB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-ZVBB-NEXT:    vror.vi v8, v8, 63
-; CHECK-ZVBB-NEXT:    ret
+; CHECK-ZVKB-LABEL: vror_vi_rotl_nxv8i64:
+; CHECK-ZVKB:       # %bb.0:
+; CHECK-ZVKB-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-ZVKB-NEXT:    vror.vi v8, v8, 63
+; CHECK-ZVKB-NEXT:    ret
   %x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer))
   ret <vscale x 8 x i64> %x
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-zfa.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-zfa.ll
index 7225677e61f606..59be018efb857b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsplats-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-zfa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+zfh,+experimental-zfa,+zvfh,+v -target-abi ilp32d -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+zfh,+zfa,+zvfh,+v -target-abi ilp32d -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+zfh,+experimental-zfa,+zvfh,+v -target-abi lp64d -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+zfh,+zfa,+zvfh,+v -target-abi lp64d -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK
 
 define <vscale x 8 x half> @vsplat_f16_0p625() {
diff --git a/llvm/test/CodeGen/SystemZ/zos-intrinsics.ll b/llvm/test/CodeGen/SystemZ/zos-intrinsics.ll
new file mode 100644
index 00000000000000..b1f0c9e5b8e54f
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/zos-intrinsics.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple s390x-zos < %s | FileCheck %s
+
+define float @sqrt_ieee(float %x) {
+entry:
+  %res = call float @llvm.sqrt.f32(float %x)
+  ret float %res
+}
+
+define float @cos_ieee(float %x) {
+entry:
+  %res = call float @llvm.cos.f32(float %x)
+  ret float %res
+}
+
+define double @sin_ieee(double %x) {
+entry:
+  %res = call double @llvm.sin.f64(double %x)
+  ret double %res
+}
+
+define fp128 @exp2_ieee(fp128 %x) {
+entry:
+  %res = call fp128 @llvm.exp2.f128(fp128 %x)
+  ret fp128 %res
+}
+
+declare float @llvm.sqrt.f32(float)
+declare float @llvm.cos.f32(float)
+declare double @llvm.sin.f64(double)
+declare fp128 @llvm.exp2.f128(fp128)
+
+; Check the calls in the ADA.
+; CHECK: .section ".ada"
+
+; Check that there is no call to sqrt.
+; CHECK-NOT:  .quad   R(@@WSQT@B)
+; CHECK-NOT:  .quad   V(@@WSQT@B)
+
+; Check that there is the correct library call.
+; CHECK:      .quad   R(@@FCOS@B)
+; CHECK-NEXT: .quad   V(@@FCOS@B)
+; CHECK:      .quad   R(@@SSIN@B)
+; CHECK-NEXT: .quad   V(@@SSIN@B)
+; CHECK:      .quad   R(@@LXP2@B)
+; CHECK-NEXT: .quad   V(@@LXP2@B)
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index a0ac03e9e862f2..45bb70ec44b737 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -1024,8 +1024,10 @@ middle.block:                                     ; preds = %vector.body
 define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
 ; CHECK-LABEL: DCT_mve7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #72
@@ -1072,6 +1074,7 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vmov q5, q2
 ; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov q6, q2
 ; CHECK-NEXT:    vmov q1, q2
 ; CHECK-NEXT:    mov r12, r7
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
@@ -1080,16 +1083,20 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r12
-; CHECK-NEXT:    adds r6, r3, r5
+; CHECK-NEXT:    add.w r10, r3, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT:    add.w r11, r6, r5
+; CHECK-NEXT:    add.w r11, r10, r5
 ; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r11]
+; CHECK-NEXT:    vldrwt.u32 q0, [r10]
 ; CHECK-NEXT:    add.w r6, r11, r5
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vfmat.f32 q6, q0, q7
+; CHECK-NEXT:    vldrwt.u32 q0, [r11]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
@@ -1171,7 +1178,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #72
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
   %i = load i32, ptr %NumInputs, align 4
@@ -1346,6 +1354,7 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    adds r1, r0, #1
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov q6, q3
 ; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q2, q3
@@ -1358,43 +1367,46 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r10
 ; CHECK-NEXT:    add.w r11, r3, r6
-; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
-; CHECK-NEXT:    vldrwt.u32 q1, [r11]
+; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    add.w r5, r11, r6
+; CHECK-NEXT:    sub.w r10, r10, #4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vfmat.f32 q6, q1, q0
+; CHECK-NEXT:    vldrwt.u32 q1, [r11]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q7, q1, q0
+; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #56] @ 16-byte Reload
 ; CHECK-NEXT:    adds r7, r5, r6
-; CHECK-NEXT:    adds r5, r7, r6
-; CHECK-NEXT:    sub.w r10, r10, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r5, r6
+; CHECK-NEXT:    adds r5, r7, r6
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
+; CHECK-NEXT:    adds r7, r5, r6
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q2, q4
+; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    adds r5, r7, r6
+; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q4, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
-; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    add r5, r6
 ; CHECK-NEXT:    vpstt
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index d5f472be043216..d80dd5a673e20f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1077,58 +1077,63 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
-; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s24, s9
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vmov.f32 s24, s9
+; CHECK-NEXT:    vmov.f32 s27, s10
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vmov.f32 s25, s1
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s27, s10
-; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s24, s2
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s27, s3
 ; CHECK-NEXT:    vmov.f32 s14, s0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s13, s8
 ; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s13, s8
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s24, s2
-; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vmov.f32 s0, s16
-; CHECK-NEXT:    vmov.f32 s1, s28
-; CHECK-NEXT:    vmov.f32 s3, s17
 ; CHECK-NEXT:    vmov.f32 s25, s7
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s13, s1
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s4, s16
+; CHECK-NEXT:    vmov.f32 s5, s28
+; CHECK-NEXT:    vmov.f32 s7, s17
+; CHECK-NEXT:    vmov.f32 s1, s19
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s2, s31
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s26, s11
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s15, s30
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
 ; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s30, s0
 ; CHECK-NEXT:    vmov.f32 s0, s2
 ; CHECK-NEXT:    vmov.f32 s1, s11
 ; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s18, s10
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s28, s8
 ; CHECK-NEXT:    vmov.f32 s31, s9
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s12, s29
 ; CHECK-NEXT:    vmov.f32 s29, s4
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #160]
@@ -1143,14 +1148,14 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmov.f32 s8, s1
 ; CHECK-NEXT:    vmov.f32 s11, s2
 ; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s7, s9
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s9, s21
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s21, s27
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
diff --git a/llvm/test/CodeGen/WebAssembly/unreachable.ll b/llvm/test/CodeGen/WebAssembly/unreachable.ll
index ad1c90090ac58b..72f865842bdcaa 100644
--- a/llvm/test/CodeGen/WebAssembly/unreachable.ll
+++ b/llvm/test/CodeGen/WebAssembly/unreachable.ll
@@ -1,33 +1,97 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s
+; The assertions in this file were autogenerated by
+; utils/update_llc_test_checks.py, but were hand-edited to add the
+; "end_function" lines to prevent the tests from passing when there are
+; superfluous instructions at the end of a function. You can run
+; update_llc_test_checks.py again, but please keep the "end_function" lines
+; intact when you commit.
 
-; Test that LLVM unreachable instruction and trap intrinsic are lowered to
-; wasm unreachable
+; Wasm, to generate valid code, always internally sets `--trap-unreachable` to 1
+; and `--no-trap-after-noreturn` to 0, and these command lines options, if
+; explicitly given, are ignored. Various combinations of these options should
+; have no effect and should not generate invalid code.
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs --trap-unreachable | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs --trap-unreachable | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs --trap-unreachable --no-trap-after-noreturn | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs --trap-unreachable --no-trap-after-noreturn | FileCheck %s
 
 target triple = "wasm32-unknown-unknown"
 
-declare void @llvm.trap()
-declare void @llvm.debugtrap()
-declare void @abort()
 
-; CHECK-LABEL: f1:
-; CHECK: call abort{{$}}
-; CHECK: unreachable
-define i32 @f1() {
-  call void @abort()
-  unreachable
-}
+; Test that the LLVM trap and debug trap intrinsics are lowered to wasm
+; unreachable.
 
-; CHECK-LABEL: f2:
-; CHECK: unreachable
-define void @f2() {
+declare void @llvm.trap() cold noreturn nounwind
+declare void @llvm.debugtrap() nounwind
+
+define void @trap_ret_void() {
+; CHECK-LABEL: trap_ret_void:
+; CHECK:         .functype trap_ret_void () -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    # fallthrough-return
+; CHECK-NEXT:    end_function
   call void @llvm.trap()
   ret void
 }
 
-; CHECK-LABEL: f3:
-; CHECK: unreachable
-define void @f3() {
+define void @debugtrap_ret_void() {
+; CHECK-LABEL: debugtrap_ret_void:
+; CHECK:         .functype debugtrap_ret_void () -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    # fallthrough-return
+; CHECK-NEXT:    end_function
   call void @llvm.debugtrap()
   ret void
 }
+
+; LLVM trap followed by LLVM unreachable could become exactly one wasm
+; unreachable, but two are emitted currently.
+define void @trap_unreacheable() {
+; CHECK-LABEL: trap_unreacheable:
+; CHECK:         .functype trap_unreacheable () -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    end_function
+  call void @llvm.trap()
+  unreachable
+}
+
+
+; Test that LLVM unreachable instruction is lowered to wasm unreachable when
+; necessary to fulfill the wasm operand stack requirements.
+
+declare void @ext_func()
+declare i32 @ext_func_i32()
+declare void @ext_never_return() noreturn
+
+; LLVM IR's 'unreachable' is translated to Wasm 'unreachable'.
+define i32 @missing_ret_unreachable() {
+; CHECK-LABEL: missing_ret_unreachable:
+; CHECK:         .functype missing_ret_unreachable () -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call ext_func
+; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    end_function
+  call void @ext_func()
+  unreachable
+}
+
+; This is similar to the above test, but ensures wasm unreachable is emitted
+; This is similar to the above test, but the callee has a 'noreturn' attribute.    
+; There is an optimization that removes an 'unreachable' after a noreturn call,  
+; but Wasm backend doesn't use it and ignore `--no-trap-after-noreturn`, if      
+; given, to generate valid code. 
+define i32 @missing_ret_noreturn_unreachable() {
+; CHECK-LABEL: missing_ret_noreturn_unreachable:
+; CHECK:         .functype missing_ret_noreturn_unreachable () -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call ext_never_return
+; CHECK-NEXT:    unreachable
+; CHECK-NEXT:    end_function
+  call void @ext_never_return()
+  unreachable
+}
diff --git a/llvm/test/CodeGen/WebAssembly/userstack.ll b/llvm/test/CodeGen/WebAssembly/userstack.ll
index b92946d1a6af7f..f7bdcdd42ba131 100644
--- a/llvm/test/CodeGen/WebAssembly/userstack.ll
+++ b/llvm/test/CodeGen/WebAssembly/userstack.ll
@@ -550,6 +550,18 @@ define void @llvm_stack_builtins(i32 %alloc) noredzone {
  ret void
 }
 
+; Use of stacksave requires local SP definition even without dymamic alloca.
+; CHECK-LABEL: llvm_stacksave_noalloca:
+define void @llvm_stacksave_noalloca() noredzone {
+ ; CHECK: global.get $push[[L11:.+]]=, __stack_pointer{{$}}
+ %stack = call i8* @llvm.stacksave()
+
+ ; CHECK-NEXT: call use_i8_star, $pop[[L11:.+]]
+ call void @use_i8_star(i8* %stack)
+
+ ret void
+}
+
 ; Not actually using the alloca'd variables exposed an issue with register
 ; stackification, where copying the stack pointer into the frame pointer was
 ; moved after the stack pointer was updated for the dynamic alloca.
@@ -617,7 +629,7 @@ define void @copytoreg_fi(i1 %cond, ptr %b) {
 ; CHECK-32-NEXT:    i32.const $push4=, 1
 ; CHECK-32-NEXT:    i32.and $push7=, $pop8, $pop4
 ; CHECK-32-NEXT:    local.set 0, $pop7
-; CHECK-32-NEXT:  .LBB10_1: # %body
+; CHECK-32-NEXT:  # %body
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    loop # label0:
 ; CHECK-32-NEXT:    local.get $push9=, 2
@@ -645,7 +657,7 @@ define void @copytoreg_fi(i1 %cond, ptr %b) {
 ; CHECK-64-NEXT:    i32.const $push4=, 1
 ; CHECK-64-NEXT:    i32.and $push7=, $pop8, $pop4
 ; CHECK-64-NEXT:    local.set 0, $pop7
-; CHECK-64-NEXT:  .LBB10_1: # %body
+; CHECK-64-NEXT:  # %body
 ; CHECK-64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-64-NEXT:    loop # label0:
 ; CHECK-64-NEXT:    local.get $push9=, 2
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
new file mode 100644
index 00000000000000..daa521d3917cdf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
+define i32 @mask_add_sext_i32_i64(ptr %base, i32 %i) {
+; X86-LABEL: mask_add_sext_i32_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 12(%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: mask_add_sext_i32_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    sarl $24, %esi
+; X64-NEXT:    movslq %esi, %rax
+; X64-NEXT:    movl 12(%rdi,%rax,4), %eax
+; X64-NEXT:    retq
+  %mask = ashr i32 %i, 24
+  %offset = add i32 %mask, 3
+  %sext = sext i32 %offset to i64
+  %gep = getelementptr i32, ptr %base, i64 %sext
+  %ret = load i32, ptr %gep
+  ret i32 %ret
+}
+
+define i32 @mask_add_zext_i32_i64(ptr %base, i32 %i) {
+; X86-LABEL: mask_add_zext_i32_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl 12(%eax,%ecx,4), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: mask_add_zext_i32_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    movl 12(%rdi,%rsi,4), %eax
+; X64-NEXT:    retq
+  %mask = and i32 %i, 15
+  %offset = add i32 %mask, 3
+  %zext = zext i32 %offset to i64
+  %gep = getelementptr i32, ptr %base, i64 %zext
+  %ret = load i32, ptr %gep
+  ret i32 %ret
+}
+
+define i32 @mask_offset_scale_zext_i32_i64(ptr %base, i32 %i) {
+; X86-LABEL: mask_offset_scale_zext_i32_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $11, %ecx
+; X86-NEXT:    movl 48(%eax,%ecx), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: mask_offset_scale_zext_i32_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $65280, %esi # imm = 0xFF00
+; X64-NEXT:    movl 48(%rdi,%rsi,8), %eax
+; X64-NEXT:    retq
+  %mask = and i32 %i, 65280
+  %offset = or i32 %mask, 6
+  %scale = shl i32 %offset, 1
+  %idxprom = zext i32 %scale to i64
+  %arrayidx = getelementptr inbounds i32, ptr %base, i64 %idxprom
+  %load = load i32, ptr %arrayidx, align 4
+  ret i32 %load
+}
diff --git a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression.ll b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression.ll
new file mode 100644
index 00000000000000..66ba54f3e318e1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; Check there's no assert in the coalescer from implicit-def operands
+; on an IMPLICIT_DEF.
+
+
+define i1 @_ZN4llvm8LLParser17parseDIEnumeratorERPNS_6MDNodeEb(i32 %arg) {
+; CHECK-LABEL: _ZN4llvm8LLParser17parseDIEnumeratorERPNS_6MDNodeEb:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jne .LBB0_4
+; CHECK-NEXT:  # %bb.3: # %if.end.i.i.preheader
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:  .LBB0_4: # %if.then.i.i
+; CHECK-NEXT:    movb $1, %dl
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    je .LBB0_6
+; CHECK-NEXT:  .LBB0_7: # %do.end
+; CHECK-NEXT:    movq %rcx, 0
+; CHECK-NEXT:    movb %al, 0
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    jne .LBB0_7
+; CHECK-NEXT:  .LBB0_6: # %if.then8
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+entry:
+  switch i32 %arg, label %if.then.i.i [
+    i32 1, label %"_ZN4llvm8LLParser17parseMDFieldsImplIZNS0_17parseDIEnumeratorERPNS_6MDNodeEbE3$_0EEbT_RNS_5SMLocE.exit"
+    i32 0, label %if.end.i.i.preheader
+  ]
+
+if.end.i.i.preheader:                             ; preds = %entry
+  br label %if.then.i.i
+
+if.then.i.i:                                      ; preds = %if.end.i.i.preheader, %entry
+  %i = phi i64 [ 0, %entry ], [ 1, %if.end.i.i.preheader ]
+  %i1 = phi i8 [ 0, %entry ], [ 1, %if.end.i.i.preheader ]
+  br label %"_ZN4llvm8LLParser17parseMDFieldsImplIZNS0_17parseDIEnumeratorERPNS_6MDNodeEbE3$_0EEbT_RNS_5SMLocE.exit"
+
+"_ZN4llvm8LLParser17parseMDFieldsImplIZNS0_17parseDIEnumeratorERPNS_6MDNodeEbE3$_0EEbT_RNS_5SMLocE.exit": ; preds = %if.then.i.i, %entry
+  %i2 = phi i64 [ 0, %entry ], [ %i, %if.then.i.i ]
+  %i3 = phi i8 [ 0, %entry ], [ %i1, %if.then.i.i ]
+  %i4 = phi i8 [ 0, %entry ], [ 1, %if.then.i.i ]
+  %tobool7.not = icmp eq i8 %i4, 0
+  br i1 %tobool7.not, label %if.then8, label %do.end
+
+if.then8:                                         ; preds = %"_ZN4llvm8LLParser17parseMDFieldsImplIZNS0_17parseDIEnumeratorERPNS_6MDNodeEbE3$_0EEbT_RNS_5SMLocE.exit"
+  ret i1 false
+
+do.end:                                           ; preds = %"_ZN4llvm8LLParser17parseMDFieldsImplIZNS0_17parseDIEnumeratorERPNS_6MDNodeEbE3$_0EEbT_RNS_5SMLocE.exit"
+  store i64 %i2, ptr null, align 8
+  store i8 %i3, ptr null, align 4
+  ret i1 false
+}
diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll
index 6965f6d7af27b5..99f26525d49e5b 100644
--- a/llvm/test/CodeGen/X86/combine-subo.ll
+++ b/llvm/test/CodeGen/X86/combine-subo.ll
@@ -4,9 +4,14 @@
 
 declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare { i8, i1 } @llvm.ssub.with.overflow.i8(i8, i8) nounwind readnone
+declare { i8, i1 } @llvm.usub.with.overflow.i8(i8, i8) nounwind readnone
+
 
 declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare { <4 x i8>, <4 x i1> } @llvm.ssub.with.overflow.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
+declare { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> , <4 x i8>) nounwind readnone
 
 ; fold (ssub x, 0) -> x
 define i32 @combine_ssub_zero(i32 %a0, i32 %a1) {
@@ -148,3 +153,79 @@ define <4 x i32> @combine_vec_usub_negone(<4 x i32> %a0, <4 x i32> %a1) {
   %4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
   ret <4 x i32> %4
 }
+
+define { i32, i1 } @combine_usub_nuw(i32 %a, i32 %b) {
+; CHECK-LABEL: combine_usub_nuw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    orl $-2147483648, %eax # imm = 0x80000000
+; CHECK-NEXT:    andl $2147483647, %esi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %esi, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    retq
+  %aa = or i32 %a, 2147483648
+  %bb = and i32 %b, 2147483647
+  %x = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %aa, i32 %bb)
+  ret { i32, i1 } %x
+}
+
+define { i8, i1 } @usub_always_overflow(i8 %x) nounwind {
+; CHECK-LABEL: usub_always_overflow:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    orb $64, %dil
+; CHECK-NEXT:    movb $63, %al
+; CHECK-NEXT:    subb %dil, %al
+; CHECK-NEXT:    setb %dl
+; CHECK-NEXT:    retq
+  %y = or i8 %x, 64
+  %a = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 63, i8 %y)
+  ret { i8, i1 } %a
+}
+
+define { i8, i1 } @ssub_always_overflow(i8 %x) nounwind {
+; CHECK-LABEL: ssub_always_overflow:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpb $30, %dil
+; CHECK-NEXT:    movl $29, %ecx
+; CHECK-NEXT:    cmovgel %edi, %ecx
+; CHECK-NEXT:    movb $-100, %al
+; CHECK-NEXT:    subb %cl, %al
+; CHECK-NEXT:    seto %dl
+; CHECK-NEXT:    retq
+  %c = icmp sgt i8 %x, 29
+  %y = select i1 %c, i8 %x, i8 29
+  %a = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 -100, i8 %y)
+  ret { i8, i1 } %a
+}
+
+define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
+; SSE-LABEL: always_usub_const_vector:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: always_usub_const_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    retq
+  %x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 0, i8 0, i8 0, i8 0>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>)
+  ret { <4 x i8>, <4 x i1> } %x
+}
+
+define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
+; SSE-LABEL: never_usub_const_vector:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = <127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: never_usub_const_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    retq
+  %x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)
+  ret { <4 x i8>, <4 x i1> } %x
+}
diff --git a/llvm/test/CodeGen/X86/fast-isel-dbg-value-alloca.ll b/llvm/test/CodeGen/X86/fast-isel-dbg-value-alloca.ll
new file mode 100644
index 00000000000000..77f883ee9bd3a7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fast-isel-dbg-value-alloca.ll
@@ -0,0 +1,32 @@
+; RUN: llc -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -stop-after=finalize-isel %s -o - | \
+; RUN:    FileCheck %s
+
+define void @foo(ptr noalias nocapture %arg) !dbg !38 {
+  %k.debug = alloca ptr, align 8
+  store ptr %arg, ptr %k.debug, align 8, !dbg !70
+  call void @llvm.dbg.value(metadata ptr %k.debug, metadata !55, metadata !DIExpression(DW_OP_deref)), !dbg !70
+; CHECK: call void @llvm.dbg.value(metadata ptr %{{.*}}, metadata ![[VAR:.*]], metadata ![[EXPR:.*]])
+; CHECK: DBG_VALUE %stack.0{{.*}}, $noreg, ![[VAR]], ![[EXPR]]
+  ret void, !dbg !70
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!6, !7, !8, !9}
+!llvm.dbg.cu = !{!16}
+
+!6 = !{i32 7, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 8, !"PIC Level", i32 2}
+!16 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !17, producer: "blah", isOptimized: false, runtimeVersion: 5, emissionKind: FullDebug, sysroot: "blah", sdk: "blah")
+!17 = !DIFile(filename: "blah", directory: "blah")
+!38 = distinct !DISubprogram(name: "blah", linkageName: "$blah", scope: !17, file: !17, line: 34, type: !39, scopeLine: 34, spFlags: DISPFlagDefinition, unit: !16, retainedNodes: !43)
+!39 = !DISubroutineType(types: !40)
+!40 = !{!41, !41}
+!41 = !DICompositeType(tag: DW_TAG_structure_type, name: "blah")
+!43 = !{!49, !55}
+!49 = !DILocalVariable(name: "x", arg: 1, scope: !38, file: !17, line: 34, type: !41)
+!55 = !DILocalVariable(name: "k", scope: !56, file: !17, line: 36, type: !41)
+!56 = distinct !DILexicalBlock(scope: !38, file: !17, line: 36, column: 9)
+!70 = !DILocation(line: 36, column: 9, scope: !56)
diff --git a/llvm/test/CodeGen/X86/gc-empty-basic-blocks.ll b/llvm/test/CodeGen/X86/gc-empty-basic-blocks.ll
index f737e43d3b816a..54ed34b2eae4e5 100644
--- a/llvm/test/CodeGen/X86/gc-empty-basic-blocks.ll
+++ b/llvm/test/CodeGen/X86/gc-empty-basic-blocks.ll
@@ -1,6 +1,8 @@
-;; This test verifies that -gc-empty-basic-blocks removes empty blocks.
+;; This test verifies that -gc-empty-basic-blocks removes regular empty blocks
+;; but does not remove empty blocks which have their address taken.
 ; RUN: llc < %s -mtriple=x86_64 -O0 -gc-empty-basic-blocks | FileCheck %s
 
+;; This function has a regular empty block.
 define void @foo(i1 zeroext %0) nounwind {
   br i1 %0, label %2, label %empty_block
 
@@ -10,7 +12,7 @@ define void @foo(i1 zeroext %0) nounwind {
 ; CHECK-NEXT:    jmp .LBB0_3
 
 2:                                               ; preds = %1
-  %3 = call i32 @bar()
+  %3 = call i32 @baz()
   br label %4
 
 ; CHECK-LABEL: .LBB0_1:
@@ -19,8 +21,8 @@ define void @foo(i1 zeroext %0) nounwind {
 empty_block:                                     ; preds = %1
   unreachable
 
-; CHECK-NOT: %empty_block
-; CHECK-NOT: .LBB0_2
+; CHECK-NOT:     %empty_block
+; CHECK-NOT:   .LBB0_2
 
 4:                                               ; preds = %2, %empty_block
   ret void
@@ -30,4 +32,23 @@ empty_block:                                     ; preds = %1
 
 }
 
-declare i32 @bar()
+;; This function has an empty block which has its address taken. Check that it
+;; is not removed by -gc-empty-basic-blocks.
+define void @bar(i1 zeroext %0) nounwind {
+entry:
+  %1 = select i1 %0, ptr blockaddress(@bar, %empty_block), ptr blockaddress(@bar, %bb2) ; <ptr> [#uses=1]
+  indirectbr ptr %1, [label %empty_block, label %bb2]
+
+; CHECK-LABEL: bar:
+
+empty_block:                                                ; preds = %entry
+  unreachable
+
+; CHECK-LABEL: .LBB1_1: # %empty_block
+
+bb2:                                                ; preds = %entry
+  %2 = call i32 @baz()
+  ret void
+}
+
+declare i32 @baz()
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
index 0817211f50937e..c997d314a50ae4 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
@@ -341,3 +341,39 @@ entry:
   store i32 %i, ptr addrspace(272) %s, align 8
   ret void
 }
+
+define i64 @test_load_sptr32_zext_i64(ptr addrspace(270) %i) {
+; ALL-LABEL: test_load_sptr32_zext_i64:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT:    movl (%eax), %eax
+; ALL-NEXT:    xorl %edx, %edx
+; ALL-NEXT:    retl
+entry:
+  %0 = load i32, ptr addrspace(270) %i, align 4
+  %1 = zext i32 %0 to i64
+  ret i64 %1
+}
+
+define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
+; CHECK-LABEL: test_store_sptr32_trunc_i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    andl $1, %ecx
+; CHECK-NEXT:    movb %cl, (%eax)
+; CHECK-NEXT:    retl
+;
+; CHECK-O0-LABEL: test_store_sptr32_trunc_i1:
+; CHECK-O0:       # %bb.0: # %entry
+; CHECK-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-O0-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-O0-NEXT:    andl $1, %ecx
+; CHECK-O0-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-O0-NEXT:    movb %cl, (%eax)
+; CHECK-O0-NEXT:    retl
+entry:
+  %0 = trunc i32 %i to i1
+  store i1 %0, ptr addrspace(270) %s
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
index e88a494908701d..2dd6011f36b775 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
@@ -256,3 +256,35 @@ entry:
   store i32 %i, ptr addrspace(272) %s, align 8
   ret void
 }
+
+define i64 @test_load_sptr32_zext_i64(ptr addrspace(270) %i) {
+; CHECK-LABEL: test_load_sptr32_zext_i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movslq %ecx, %rax
+; CHECK-NEXT:    movq (%rax), %rax
+; CHECK-NEXT:    retq
+;
+; CHECK-O0-LABEL: test_load_sptr32_zext_i64:
+; CHECK-O0:       # %bb.0: # %entry
+; CHECK-O0-NEXT:    movslq %ecx, %rax
+; CHECK-O0-NEXT:    movl (%rax), %eax
+; CHECK-O0-NEXT:    movl %eax, %eax
+; CHECK-O0-NEXT:    # kill: def $rax killed $eax
+; CHECK-O0-NEXT:    retq
+entry:
+  %0 = load i32, ptr addrspace(270) %i, align 4
+  %1 = zext i32 %0 to i64
+  ret i64 %1
+}
+
+define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
+; ALL-LABEL: test_store_sptr32_trunc_i1:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    movslq %ecx, %rax
+; ALL-NEXT:    movl %edx, (%rax)
+; ALL-NEXT:    retq
+entry:
+  %0 = trunc i32 %i to i1
+  store i1 %0, ptr addrspace(270) %s
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/or-with-overflow.ll b/llvm/test/CodeGen/X86/or-with-overflow.ll
index 4440485af54bba..b3ffa209bc7004 100644
--- a/llvm/test/CodeGen/X86/or-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/or-with-overflow.ll
@@ -161,19 +161,13 @@ define i32 @or_i32_rr(i32 %0, i32 %1) {
 define i64 @or_i64_ri(i64 %0, i64 %1) nounwind {
 ; X86-LABEL: or_i64_ri:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $17, %ecx
-; X86-NEXT:    cmpl $1, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    jl .LBB6_2
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    js .LBB6_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    orl $17, %eax
 ; X86-NEXT:  .LBB6_2:
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_i64_ri:
diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll
new file mode 100644
index 00000000000000..64c7f4fb143bfe
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr67333.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
+; CHECK-LABEL: SHA256_Compress_Generic:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movbel 0, %eax
+; CHECK-NEXT:    movbel 12(%rdi), %ecx
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
+; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm0
+; CHECK-NEXT:    vpslld $15, %xmm2, %xmm3
+; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm3
+; CHECK-NEXT:    vpslld $13, %xmm2, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %ecx, %xmm3
+; CHECK-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm0
+; CHECK-NEXT:    vpslld $15, %xmm1, %xmm3
+; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm3
+; CHECK-NEXT:    vpslld $13, %xmm1, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm0, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm0, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
+; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm3
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm2
+; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
+; CHECK-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrld $10, %xmm1, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
+; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
+; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
+; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
+; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm2
+; CHECK-NEXT:    vpslld $15, %xmm0, %xmm3
+; CHECK-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm3
+; CHECK-NEXT:    vpslld $13, %xmm0, %xmm4
+; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm3
+; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsllq $32, %xmm1, %xmm3
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT:    vmovdqu %ymm0, 132(%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i32, ptr null, align 4
+  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
+  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
+  %2 = load i32, ptr %arrayidx14, align 4
+  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
+  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
+  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
+  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
+  %7 = xor <2 x i32> %5, %6
+  %8 = lshr <2 x i32> %4, zeroinitializer
+  %9 = xor <2 x i32> %7, %8
+  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
+  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
+  %12 = add <2 x i32> %11, %9
+  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
+  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
+  %15 = xor <2 x i32> %13, %14
+  %16 = lshr <2 x i32> %12, zeroinitializer
+  %17 = xor <2 x i32> %15, %16
+  %18 = add <2 x i32> %4, %17
+  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
+  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
+  %21 = xor <2 x i32> %19, %20
+  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
+  %23 = xor <2 x i32> %21, %22
+  %24 = add <2 x i32> %4, %23
+  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
+  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
+  %27 = xor <2 x i32> %25, %26
+  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
+  %29 = xor <2 x i32> %27, %28
+  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
+  %31 = add <2 x i32> %30, %29
+  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
+  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
+  %34 = xor <2 x i32> %32, %33
+  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
+  %36 = xor <2 x i32> %34, %35
+  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
+  %38 = add <2 x i32> %37, %36
+  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
+  store <2 x i32> %38, ptr %arrayidx918, align 4
+  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
+  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
+  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
+  %41 = xor <2 x i32> %39, %40
+  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
+  %43 = xor <2 x i32> %41, %42
+  %44 = add <2 x i32> %37, %43
+  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
+  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
+  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
+  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
+  %47 = xor <2 x i32> %45, %46
+  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
+  %49 = xor <2 x i32> %47, %48
+  %50 = lshr <2 x i32> %24, zeroinitializer
+  %51 = add <2 x i32> %50, %49
+  store <2 x i32> %51, ptr %arrayidx1106, align 4
+  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
+  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
+  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
+  %54 = xor <2 x i32> %52, %53
+  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
+  %56 = xor <2 x i32> %54, %55
+  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
+  %58 = insertelement <2 x i32> %57, i32 0, i64 0
+  %59 = add <2 x i32> %58, %56
+  store <2 x i32> %59, ptr %arrayidx1200, align 4
+  ret void
+
+; uselistorder directives
+  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
+  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
+}
+
+declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2
+
+; uselistorder directives
+uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #3 = { nounwind memory(none) }
diff --git a/llvm/test/CodeGen/X86/pr68068.ll b/llvm/test/CodeGen/X86/pr68068.ll
new file mode 100644
index 00000000000000..774b3e311f18e3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr68068.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -fast-isel=1 | FileCheck %s
+
+define float @f() "target-features"="+avx512f" {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = uitofp i15 poison to float
+  ret float %1
+}
diff --git a/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir b/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir
new file mode 100644
index 00000000000000..b99c5fc8df0cb6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir
@@ -0,0 +1,169 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# Test the implicit operands and operand flags on remateralized
+# instructions folding into a sub or super register def.
+
+# Test a full register def at %t1 rematerializing into a subregister
+# use. The subregister dest operand should be dead, with an
+# implicit-def of the full physical register.
+
+---
+name: rematerialize_super_register_into_subreg_def
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: rematerialize_super_register_into_subreg_def
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %t3:gr64 = MOV64ri32 -11
+  ; CHECK-NEXT:   CMP64ri8 %t3, 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.2, 4, implicit killed $eflags
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $rax = MOV64ri32 -11, implicit-def $eax
+  ; CHECK-NEXT:   CMP64ri8 %t3, 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.1, 4, implicit killed $eflags
+  ; CHECK-NEXT:   RET 0, $eax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %t3:gr64 = ADD64ri8 %t3, 10, implicit-def $eflags
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   $rax = COPY %t3
+  ; CHECK-NEXT:   RET 0, $rax
+  bb.0:
+    %t1:gr64 = MOV64ri32 -11
+    CMP64ri8 %t1, 1, implicit-def $eflags
+    JCC_1 %bb.2, 4, implicit killed $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    %t2:gr64 = COPY %t1
+    $eax = COPY %t2.sub_32bit
+    CMP64ri8 %t2, 1, implicit-def $eflags
+    JCC_1 %bb.1, 4, implicit killed $eflags
+    RET 0, $eax
+
+  bb.2:
+    %t3:gr64 = COPY %t1
+    %t3:gr64 = ADD64ri8 %t3, 10, implicit-def $eflags
+
+  bb.3:
+    $rax = COPY %t3
+    RET 0, $rax
+
+...
+
+# Test an undef subregister def of %t1 rematerializing into a physical
+# super register def
+---
+name:            rematerialize_subregister_into_superreg_def
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: rematerialize_subregister_into_superreg_def
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef %t3.sub_32bit:gr64_with_sub_8bit = MOV32ri -11
+  ; CHECK-NEXT:   CMP64ri8 %t3, 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.2, 4, implicit killed $eflags
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $eax = MOV32ri -11, implicit-def $rax
+  ; CHECK-NEXT:   CMP64ri8 %t3, 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.1, 4, implicit killed $eflags
+  ; CHECK-NEXT:   RET 0, $rax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %t3:gr64_with_sub_8bit = ADD64ri8 %t3, 10, implicit-def $eflags
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   $rax = COPY %t3
+  ; CHECK-NEXT:   RET 0, $rax
+  bb.0:
+    undef %t1.sub_32bit:gr64_with_sub_8bit = MOV32ri -11
+    CMP64ri8 %t1, 1, implicit-def $eflags
+    JCC_1 %bb.2, 4, implicit killed $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    %t2:gr64 = COPY %t1
+    $rax = COPY %t2
+    CMP64ri8 %t2, 1, implicit-def $eflags
+    JCC_1 %bb.1, 4, implicit killed $eflags
+    RET 0, $rax
+
+  bb.2:
+    %t3:gr64 = COPY %t1
+    %t3:gr64 = ADD64ri8 %t3, 10, implicit-def $eflags
+
+  bb.3:
+    $rax = COPY %t3
+    RET 0, $rax
+
+...
+
+# Handle that rematerializing an instruction with an implicit def of a
+# virtual super register into a physical register works.
+---
+name:            rematerialize_subregister_into_superreg_def_with_impdef_physreg
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: rematerialize_subregister_into_superreg_def_with_impdef_physreg
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef %t3.sub_32bit:gr64_with_sub_8bit = MOV32ri -11, implicit-def %t3
+  ; CHECK-NEXT:   CMP64ri8 %t3, 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.2, 4, implicit killed $eflags
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $eax = MOV32ri -11, implicit-def $rax
+  ; CHECK-NEXT:   CMP64ri8 %t3, 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.1, 4, implicit killed $eflags
+  ; CHECK-NEXT:   RET 0, $rax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %t3:gr64_with_sub_8bit = ADD64ri8 %t3, 10, implicit-def $eflags
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   $rax = COPY %t3
+  ; CHECK-NEXT:   RET 0, $rax
+  bb.0:
+    undef %t1.sub_32bit:gr64_with_sub_8bit = MOV32ri -11, implicit-def %t1
+    CMP64ri8 %t1, 1, implicit-def $eflags
+    JCC_1 %bb.2, 4, implicit killed $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    %t2:gr64 = COPY %t1
+    $rax = COPY %t2
+    CMP64ri8 %t2, 1, implicit-def $eflags
+    JCC_1 %bb.1, 4, implicit killed $eflags
+    RET 0, $rax
+
+  bb.2:
+    %t3:gr64 = COPY %t1
+    %t3:gr64 = ADD64ri8 %t3, 10, implicit-def $eflags
+
+  bb.3:
+    $rax = COPY %t3
+    RET 0, $rax
+
+...
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index b6a245a98634ff..e104d9d2b4c157 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1756,8 +1756,8 @@ define void @PR51226() {
 ; X86-AVX2-LABEL: PR51226:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
@@ -1767,8 +1767,8 @@ define void @PR51226() {
 ; X86-AVX512-LABEL: PR51226:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX512-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
@@ -1789,8 +1789,8 @@ define void @PR51226() {
 ; X64-AVX2-LABEL: PR51226:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
@@ -1800,8 +1800,8 @@ define void @PR51226() {
 ; X64-AVX512-LABEL: PR51226:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 97be882ffd8e43..7893a799c207ab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -725,17 +725,19 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-SLOW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-SLOW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-SLOW-NEXT:    vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-SLOW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
 ; AVX512BW-SLOW-NEXT:    kmovq %rcx, %k1
 ; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index 50f19d31183e5e..da468b6d809e89 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -632,45 +632,71 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm6
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm7, %xmm6, %xmm8
+; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm8, %xmm9
+; AVX2-NEXT:    vpand %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
+; AVX2-NEXT:    vpaddb %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpand %xmm4, %xmm8, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX2-NEXT:    vpaddw %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm8
+; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm9
+; AVX2-NEXT:    vpand %xmm7, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm9, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm9
+; AVX2-NEXT:    vpaddb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpand %xmm9, %xmm8, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpaddw %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm9
+; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm10
+; AVX2-NEXT:    vpand %xmm7, %xmm10, %xmm10
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm10, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX2-NEXT:    vpaddb %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm10, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm9, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpaddw %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm10
+; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm11
+; AVX2-NEXT:    vpand %xmm7, %xmm11, %xmm7
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm7, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm10, %xmm10
+; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddb %xmm5, %xmm10, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm3, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm4
-; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm5, %ymm8
-; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
-; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX2-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm1, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpaddd %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm4, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm0, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    retq
@@ -987,45 +1013,71 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm6
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm7, %xmm6, %xmm8
+; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm8, %xmm9
+; AVX2-NEXT:    vpand %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
+; AVX2-NEXT:    vpaddb %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpand %xmm4, %xmm8, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX2-NEXT:    vpaddw %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm8
+; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm9
+; AVX2-NEXT:    vpand %xmm7, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm9, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm9
+; AVX2-NEXT:    vpaddb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpand %xmm9, %xmm8, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpaddw %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm9
+; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm10
+; AVX2-NEXT:    vpand %xmm7, %xmm10, %xmm10
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm10, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX2-NEXT:    vpaddb %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm10, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm9, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpaddw %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm10
+; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm11
+; AVX2-NEXT:    vpand %xmm7, %xmm11, %xmm7
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm7, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm10, %xmm10
+; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddb %xmm5, %xmm10, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm3, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm4
-; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm5, %ymm8
-; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
-; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX2-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm1, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpaddd %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm4, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm0, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 8f9d2169aa19b9..92973274f29190 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -5682,9 +5682,10 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -5794,9 +5795,10 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -5906,9 +5908,10 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.v
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -5999,9 +6002,10 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -6205,12 +6209,12 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -6324,12 +6328,12 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -6443,12 +6447,12 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/DebugInfo/COFF/lines-bb-start.ll b/llvm/test/DebugInfo/COFF/lines-bb-start.ll
index 9ff112cf7ae7c3..70c96b368c9ad9 100644
--- a/llvm/test/DebugInfo/COFF/lines-bb-start.ll
+++ b/llvm/test/DebugInfo/COFF/lines-bb-start.ll
@@ -90,6 +90,8 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK:         .cv_loc {{.*}} # t.c:4:5
 ; CHECK:         jmp     LBB{{.*}}
 ; CHECK: LBB2_{{.*}}:                                 # %if.end
+; CHECK-NEXT: L{{.*}}:
+; CHECK-NEXT: DEBUG_VALUE: lea_dbg_value:
 ; CHECK-NEXT:    .cv_loc {{.*}} # t.c:5:3
 ; CHECK:         leal 4(%esp), %[[reg:[^ ]*]]
 ; CHECK:         movl    %[[reg]], (%esp)
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
index 770cd60a1d70d7..716c4e5abd3fb9 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
@@ -13,20 +13,17 @@
 # CHECK:      Extracted {{.*}} section = .eh_frame
 # CHECK: EHFrameEdgeFixer: Processing .eh_frame in "{{.*}}"...
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is CIE
+# CHECK:     Record is CIE
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is FDE
-# CHECK:         Adding edge at {{.*}} to CIE at: {{.*}}
-# CHECK:         Existing edge at {{.*}} to PC begin at {{.*}}
-# CHECK:         Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+# CHECK:     Record is FDE
+# CHECK:       Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK:       Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK:       Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is FDE
-# CHECK:         Adding edge at {{.*}} to CIE at: {{.*}}
-# CHECK:         Existing edge at {{.*}} to PC begin at {{.*}}
-# CHECK:         Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+# CHECK:     Record is FDE
+# CHECK:       Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK:       Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK:       Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
 
 	.text
 	.globl	main
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
index 5f821276288617..4bc006a9a7a2a9 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
@@ -13,15 +13,13 @@
 # CHECK:      Extracted {{.*}} section = __TEXT,__eh_frame
 # CHECK: EHFrameEdgeFixer: Processing __TEXT,__eh_frame in "{{.*}}"...
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is CIE
+# CHECK:     Record is CIE
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is FDE
-# CHECK:         Adding edge at {{.*}} to CIE at: {{.*}}
-# CHECK:         Existing edge at {{.*}} to PC begin at {{.*}}
-# CHECK:         Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
-# CHECK:         Existing edge at {{.*}} to LSDA at {{.*}}
+# CHECK:     Record is FDE
+# CHECK:       Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK:       Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK:       Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+# CHECK:       Existing edge at {{.*}} to LSDA at {{.*}}
 
 	.section	__TEXT,__text,regular,pure_instructions
  	.globl	_main
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
index 4a1a1649508eb6..3eb3cba11da262 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
@@ -13,20 +13,17 @@
 # CHECK:      Extracted {{.*}} section = .eh_frame
 # CHECK: EHFrameEdgeFixer: Processing .eh_frame in "{{.*}}"...
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is CIE
+# CHECK:     Record is CIE
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is FDE
-# CHECK:         Adding edge at {{.*}} to CIE at: {{.*}}
-# CHECK:         Existing edge at {{.*}} to PC begin at {{.*}}
-# CHECK:         Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+# CHECK:     Record is FDE
+# CHECK:       Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK:       Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK:       Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is FDE
-# CHECK:         Adding edge at {{.*}} to CIE at: {{.*}}
-# CHECK:         Existing edge at {{.*}} to PC begin at {{.*}}
-# CHECK:         Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+# CHECK:     Record is FDE
+# CHECK:       Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK:       Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK:       Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
 
  .text
  .globl main
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
index 3b84768ce78665..13e9006b7a8a81 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
@@ -16,15 +16,13 @@
 # CHECK:       Extracted {{.*}} section = .eh_frame
 # CHECK: EHFrameEdgeFixer: Processing .eh_frame in "{{.*}}"...
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is CIE
+# CHECK:     Record is CIE
 # CHECK:   Processing block at
-# CHECK:     Processing CFI record at
-# CHECK:       Record is FDE
-# CHECK:         Adding edge at {{.*}} to CIE at: {{.*}}
-# CHECK:         Processing PC-begin at
-# CHECK:         Existing edge at {{.*}} to PC begin at {{.*}}
-# CHECK:         Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+# CHECK:     Record is FDE
+# CHECK:       Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK:       Processing PC-begin at
+# CHECK:       Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK:       Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
 
 	.text
 	.abiversion 2
diff --git a/llvm/test/Instrumentation/AddressSanitizer/basic.ll b/llvm/test/Instrumentation/AddressSanitizer/basic.ll
index 2064db5a0918fc..068d6d18cd45eb 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/basic.ll
@@ -210,10 +210,8 @@ define void @test_swifterror_3() sanitize_address {
 
 ;; ctor/dtor have the nounwind attribute. See uwtable.ll, they additionally have
 ;; the uwtable attribute with the module flag "uwtable".
-; CHECK: define internal void @asan.module_ctor() #[[#ATTR:]] comdat {
+; CHECK: define internal void @asan.module_ctor() #[[#ATTR:]] {{(comdat )?}}{
 ; CHECK: call void @__asan_init()
-;; __asan_register_elf_globals is called even if this module does not contain instrumented global variables.
-; CHECK: call void @__asan_register_elf_globals(i64 ptrtoint (ptr @___asan_globals_registered to i64), i64 ptrtoint (ptr @__start_asan_globals to i64), i64 ptrtoint (ptr @__stop_asan_globals to i64))
 
 ; CHECK: attributes #[[#ATTR]] = { nounwind }
 
diff --git a/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll b/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll
index 1617bf9b67aa7c..0e4d1f168d0163 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll
@@ -1,12 +1,8 @@
-; RUN: rm -rf %t && split-file %s %t && cd %t
-; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK %s
-; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-apple-macosx10.11.0 -S | FileCheck --check-prefix=CHECK %s
-; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-pc-windows-msvc19.0.24215 -S | FileCheck --check-prefix=CHECK %s
-; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -asan-mapping-scale=5 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK %s
+; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-apple-macosx10.11.0 -S | FileCheck --check-prefix=CHECK %s
+; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-pc-windows-msvc19.0.24215 -S | FileCheck --check-prefix=CHECK %s
+; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -asan-mapping-scale=5 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
 
-; RUN: opt < empty.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=ELF-NOGC %s
-
-;--- a.ll
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Globals:
@@ -63,10 +59,3 @@ attributes #1 = { nounwind sanitize_address "less-precise-fpmad"="false" "frame-
 !7 = !{!"/tmp/asan-globals.cpp", i32 7, i32 5}
 !8 = !{!"/tmp/asan-globals.cpp", i32 12, i32 14}
 !9 = !{!"/tmp/asan-globals.cpp", i32 14, i32 25}
-
-;; In the presence of instrumented global variables, asan.module_ctor do not use comdat.
-; CHECK: define internal void @asan.module_ctor() #[[#ATTR:]] {
-
-; ELF-NOGC: define internal void @asan.module_ctor() #[[#ATTR:]] comdat {
-
-;--- empty.ll
diff --git a/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll b/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll
index 699b8287d358ad..47bb1f102e2fc2 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll
@@ -4,14 +4,10 @@
 ; We keep using comdats for garbage collection if odr indicators are
 ; enabled as indicator symbols will cause link time odr violations.
 ; This is to fix PR 47925.
-; RUN: rm -rf %t && split-file %s %t && cd %t
-; RUN: opt < a.ll -passes=asan -asan-globals-live-support=1 -asan-use-odr-indicator=0 -S | FileCheck %s --check-prefixes=CHECK,NOCOMDAT
+;
+; RUN: opt < %s -passes=asan -asan-globals-live-support=1 -asan-use-odr-indicator=0 -S | FileCheck %s --check-prefixes=CHECK,NOCOMDAT
 ; Check that enabling odr indicators enables comdat for globals.
-; RUN: opt < a.ll -passes=asan -asan-globals-live-support=1 -S | FileCheck %s --check-prefixes=CHECK,COMDAT
-
-; RUN: opt < no_module_id.ll -passes=asan -S | FileCheck %s --check-prefix=NOMODULEID
-
-;--- a.ll
+; RUN: opt < %s -passes=asan -asan-globals-live-support=1 -S | FileCheck %s --check-prefixes=CHECK,COMDAT
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -91,43 +87,3 @@ attributes #1 = { nounwind sanitize_address "less-precise-fpmad"="false" "frame-
 !7 = !{!"/tmp/asan-globals.cpp", i32 7, i32 5}
 !8 = !{!"/tmp/asan-globals.cpp", i32 12, i32 14}
 !9 = !{!"/tmp/asan-globals.cpp", i32 14, i32 25}
-
-;--- no_module_id.ll
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; NOMODULEID:      $asan.module_ctor = comdat any
-; NOMODULEID:      $asan.module_dtor = comdat any
-
-;; Don't place the instrumented globals in a comdat when the unique module ID is empty.
-; NOMODULEID:      @.str = internal constant { [4 x i8], [28 x i8] } { [4 x i8] c"str\00", [28 x i8] zeroinitializer }, align 32
-; NOMODULEID:      @_ZL3buf = internal global { [4 x i8], [28 x i8] } zeroinitializer, align 32
-; NOMODULEID:      @__asan_global_.str = private global {{.*}}, section "asan_globals", !associated !0
-; NOMODULEID:      @__asan_global__ZL3buf = private global {{.*}}, section "asan_globals", !associated !1
-; NOMODULEID:      @llvm.compiler.used = appending global [4 x ptr] [ptr @.str, ptr @_ZL3buf, ptr @__asan_global_.str, ptr @__asan_global__ZL3buf]
-
-; NOMODULEID:      define internal void @asan.module_ctor() #[[#]] comdat {
-; NOMODULEID-NEXT:   call void @__asan_init()
-; NOMODULEID-NEXT:   call void @__asan_version_mismatch_check_v8()
-; NOMODULEID-NEXT:   call void @__asan_register_elf_globals(i64 ptrtoint (ptr @___asan_globals_registered to i64), i64 ptrtoint (ptr @__start_asan_globals to i64), i64 ptrtoint (ptr @__stop_asan_globals to i64))
-; NOMODULEID-NEXT:   ret void
-; NOMODULEID-NEXT: }
-
-; NOMODULEID:      !0 = !{ptr @.str}
-; NOMODULEID:      !1 = !{ptr @_ZL3buf}
-
-@.str = private unnamed_addr constant [4 x i8] c"str\00", align 1
-@_ZL3buf = internal unnamed_addr global [4 x i8] zeroinitializer, align 1
-@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_a.cc, ptr null }]
-
-declare void @ext(ptr noundef)
-
-; Function Attrs: uwtable
-define internal void @_GLOBAL__sub_I_a.cc() #2 section ".text.startup" {
-entry:
-  %0 = load i8, ptr @_ZL3buf, align 1
-  %inc = add i8 %0, 1
-  store i8 %inc, ptr @_ZL3buf, align 1
-  tail call void @ext(ptr noundef nonnull @.str)
-  ret void
-}
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
index dc9731a2cd8bc4..1fcb6047797e38 100644
--- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
+++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
@@ -1,8 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; Test -sanitizer-coverage-trace-compares=1 API declarations on a non-x86_64 arch
 ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1  -S | FileCheck %s
 
 target triple = "i386-unknown-linux-gnu"
 define i32 @foo() #0 {
+; CHECK-LABEL: define i32 @foo() comdat {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   ret i32 0
 }
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
index 1050a324767e07..567c4fa5bd5ab5 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
@@ -73,6 +73,12 @@
 # CHECK: sc
 0x44 0x00 0x00 0x02
 
+# CHECK: scv 1                            
+0x44 0x00 0x00 0x21
+
+# CHECK: scv 2                            
+0x44 0x00 0x00 0x41
+
 # CHECK: clrbhrb
 0x7c 0x00 0x03 0x5c
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt
index bdade9867ed6fb..15427e91a5b2aa 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt
@@ -73,6 +73,12 @@
 # CHECK: sc
 0x02 0x00 0x00 0x44
 
+# CHECK: scv 1
+0x21 0x00 0x00 0x44
+
+# CHECK: scv 2
+0x41 0x00 0x00 0x44
+
 # CHECK: clrbhrb
 0x5c 0x03 0x00 0x7c
 
diff --git a/llvm/test/MC/Disassembler/RISCV/colored.txt b/llvm/test/MC/Disassembler/RISCV/colored.txt
index 780ec43e89358c..e79ae73955893a 100644
--- a/llvm/test/MC/Disassembler/RISCV/colored.txt
+++ b/llvm/test/MC/Disassembler/RISCV/colored.txt
@@ -1,6 +1,6 @@
 # UNSUPPORTED: system-windows
-# RUN: llvm-mc -triple=riscv64 -mattr=+zcmp,+experimental-zfa,+v --cdis %s | FileCheck %s --strict-whitespace --match-full-lines -check-prefixes=CHECK,ASM,ABINAME
-# RUN: llvm-mc -triple=riscv64 -mattr=+zcmp,+experimental-zfa,+v -M numeric --cdis %s | FileCheck %s --strict-whitespace --match-full-lines -check-prefixes=CHECK,ASM,ARCHNAME
+# RUN: llvm-mc -triple=riscv64 -mattr=+zcmp,+zfa,+v --cdis %s | FileCheck %s --strict-whitespace --match-full-lines -check-prefixes=CHECK,ASM,ABINAME
+# RUN: llvm-mc -triple=riscv64 -mattr=+zcmp,+zfa,+v -M numeric --cdis %s | FileCheck %s --strict-whitespace --match-full-lines -check-prefixes=CHECK,ASM,ARCHNAME
 
 # CHECK:	.text
 # Registers and immediates
diff --git a/llvm/test/MC/PowerPC/ppc64-encoding.s b/llvm/test/MC/PowerPC/ppc64-encoding.s
index 149ee005b4c301..36bef5e39bd94e 100644
--- a/llvm/test/MC/PowerPC/ppc64-encoding.s
+++ b/llvm/test/MC/PowerPC/ppc64-encoding.s
@@ -144,7 +144,7 @@
 # CHECK-LE: mcrf 2, 3                       # encoding: [0x00,0x00,0x0c,0x4d]
             mcrf 2, 3
 
-# System call instruction
+# System call instructions
 
 # CHECK-BE: sc 1                            # encoding: [0x44,0x00,0x00,0x22]
 # CHECK-LE: sc 1                            # encoding: [0x22,0x00,0x00,0x44]
@@ -153,6 +153,13 @@
 # CHECK-LE: sc                              # encoding: [0x02,0x00,0x00,0x44]
             sc
 
+# CHECK-BE: scv 1                            # encoding: [0x44,0x00,0x00,0x21]
+# CHECK-LE: scv 1                            # encoding: [0x21,0x00,0x00,0x44]
+            scv 1
+# CHECK-BE: scv 2                            # encoding: [0x44,0x00,0x00,0x41]
+# CHECK-LE: scv 2                            # encoding: [0x41,0x00,0x00,0x44]
+            scv 2
+
 # Branch history rolling buffer
 
 # CHECK-BE: clrbhrb                         # encoding: [0x7c,0x00,0x03,0x5c]
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index bf40eda456edf1..12c494dc896a0a 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -261,8 +261,8 @@
 .attribute arch, "rv32izifencei2p0"
 # CHECK: attribute      5, "rv32i2p1_zifencei2p0"
 
-.attribute arch, "rv32izfa0p2"
-# CHECK: attribute      5, "rv32i2p1_f2p2_zicsr2p0_zfa0p2"
+.attribute arch, "rv32izfa1p0"
+# CHECK: attribute      5, "rv32i2p1_f2p2_zicsr2p0_zfa1p0"
 
 .attribute arch, "rv32izicond1p0"
 # CHECK: attribute      5, "rv32i2p1_zicond1p0"
diff --git a/llvm/test/MC/RISCV/rv32zfa-only-valid.s b/llvm/test/MC/RISCV/rv32zfa-only-valid.s
index 2cf5c1c42f3e7a..d212659d5208d7 100644
--- a/llvm/test/MC/RISCV/rv32zfa-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32zfa-only-valid.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zfa,+d,+zfh < %s \
-# RUN:     | llvm-objdump --mattr=+experimental-zfa,+d,+zfh -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \
+# RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \
diff --git a/llvm/test/MC/RISCV/zfa-double-invalid.s b/llvm/test/MC/RISCV/zfa-double-invalid.s
index 3a92b18d6b19d0..ec21b0c613375a 100644
--- a/llvm/test/MC/RISCV/zfa-double-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-double-invalid.s
@@ -1,7 +1,7 @@
-# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zfa,+zfh \
+# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+zfh \
 # RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTD %s
-# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zfa,+zfh \
+# RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+zfh \
 # RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTD %s
 
diff --git a/llvm/test/MC/RISCV/zfa-half-invalid.s b/llvm/test/MC/RISCV/zfa-half-invalid.s
index f916c9bd66daa5..a2c6f09043084f 100644
--- a/llvm/test/MC/RISCV/zfa-half-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-half-invalid.s
@@ -1,7 +1,7 @@
-# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zfa,+d \
+# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+d \
 # RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s
-# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zfa,+d \
+# RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+d \
 # RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s
 
diff --git a/llvm/test/MC/RISCV/zfa-invalid.s b/llvm/test/MC/RISCV/zfa-invalid.s
index e48618506626be..c2537c3fc51024 100644
--- a/llvm/test/MC/RISCV/zfa-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-invalid.s
@@ -1,5 +1,5 @@
-# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zfa,+d,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV32 %s
-# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zfa,+d,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV64 %s
+# RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+d,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV32 %s
+# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+d,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV64 %s
 
 # Invalid rounding modes
 # CHECK-NO-RV64: error: operand must be 'rtz' floating-point rounding mode
diff --git a/llvm/test/MC/RISCV/zfa-valid.s b/llvm/test/MC/RISCV/zfa-valid.s
index 5207746570558c..e951c9da2ba788 100644
--- a/llvm/test/MC/RISCV/zfa-valid.s
+++ b/llvm/test/MC/RISCV/zfa-valid.s
@@ -1,12 +1,12 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zfa,+d,+zfh < %s \
-# RUN:     | llvm-objdump --mattr=+experimental-zfa,+d,+zfh -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \
+# RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zfa,+d,+zfh < %s \
-# RUN:     | llvm-objdump --mattr=+experimental-zfa,+d,+zfh -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfa,+d,+zfh < %s \
+# RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \
diff --git a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
index 6506404d966bf2..6b5dc9200f34cc 100644
--- a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
+++ b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
@@ -1,12 +1,12 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zfa,+zfhmin,+zvfh < %s \
-# RUN:     | llvm-objdump --mattr=+experimental-zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh < %s \
+# RUN:     | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zfa,+zfhmin,+zvfh < %s \
-# RUN:     | llvm-objdump --mattr=+experimental-zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh < %s \
+# RUN:     | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
diff --git a/llvm/test/MC/WebAssembly/reloc-pic.s b/llvm/test/MC/WebAssembly/reloc-pic.s
index dd5d315cc2ce03..974d6cb8969890 100644
--- a/llvm/test/MC/WebAssembly/reloc-pic.s
+++ b/llvm/test/MC/WebAssembly/reloc-pic.s
@@ -1,5 +1,5 @@
-# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj < %s | obj2yaml | FileCheck %s
-# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+reference-types -filetype=obj < %s | obj2yaml | FileCheck --check-prefix=REF %s
+# RUN: sed -e '/^REF-/d' %s | llvm-mc -triple=wasm32-unknown-unknown -filetype=obj | obj2yaml | FileCheck %s
+# RUN: sed -e 's/^REF-//g' %s | llvm-mc -triple=wasm32-unknown-unknown -mattr=+reference-types -filetype=obj | obj2yaml | FileCheck --check-prefix=REF %s
 
 # Verify that @GOT relocation entryes result in R_WASM_GLOBAL_INDEX_LEB against
 # against the corrsponding function or data symbol and that the corresponding
@@ -50,6 +50,9 @@ hidden_func:
 #.hidden hidden_data
 .size default_data, 4
 
+REF-mytable:
+REF-.tabletype mytable, externref
+
 # CHECK:      --- !WASM
 # CHECK-NEXT: FileHeader:
 # CHECK-NEXT:   Version:         0x1
@@ -209,6 +212,11 @@ hidden_func:
 # CHECK-NEXT:         Function:        5
 # REF:              - Index:           10
 # REF-NEXT:           Kind:            TABLE
+# REF-NEXT:           Name:            mytable
+# REF-NEXT:           Flags:           [ BINDING_LOCAL ]
+# REF-NEXT:           Table:           1
+# REF-NEXT:         - Index:           11
+# REF-NEXT:           Kind:            TABLE
 # REF-NEXT:           Name:            __indirect_function_table
 # REF-NEXT:           Flags:           [ UNDEFINED, NO_STRIP ]
 # REF-NEXT:           Table:           0
diff --git a/llvm/test/ObjectYAML/DXContainer/SignatureParts.yaml b/llvm/test/ObjectYAML/DXContainer/SignatureParts.yaml
new file mode 100644
index 00000000000000..81439a00c36de6
--- /dev/null
+++ b/llvm/test/ObjectYAML/DXContainer/SignatureParts.yaml
@@ -0,0 +1,254 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !dxcontainer
+Header:
+  Hash:            [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+  Version:
+    Major:           1
+    Minor:           0
+  FileSize:        600
+  PartCount:       3
+  PartOffsets:     [ 64, 124, 184 ]
+Parts:
+  - Name:            ISG1
+    Size:            52
+    Signature:
+      Parameters:
+        - Stream:          0
+          Name:            AAA_HSFoo
+          Index:           0
+          SystemValue:     Undefined
+          CompType:        Float32
+          Register:        0
+          Mask:            7
+          ExclusiveMask:   2
+          MinPrecision:    Default
+  - Name:            OSG1
+    Size:            52
+    Signature:
+      Parameters:
+        - Stream:          0
+          Name:            SV_Position
+          Index:           0
+          SystemValue:     Position
+          CompType:        Float32
+          Register:        0
+          Mask:            15
+          ExclusiveMask:   0
+          MinPrecision:    Default
+  - Name:            PSG1
+    Size:            372
+    Signature:
+      Parameters:
+        - Stream:          0
+          Name:            SV_TessFactor
+          Index:           0
+          SystemValue:     FinalQuadEdgeTessfactor
+          CompType:        Float32
+          Register:        0
+          Mask:            8
+          ExclusiveMask:   8
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            BBB
+          Index:           0
+          SystemValue:     Undefined
+          CompType:        Float32
+          Register:        0
+          Mask:            7
+          ExclusiveMask:   0
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            SV_TessFactor
+          Index:           1
+          SystemValue:     FinalQuadEdgeTessfactor
+          CompType:        Float32
+          Register:        1
+          Mask:            8
+          ExclusiveMask:   8
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            BBB
+          Index:           1
+          SystemValue:     Undefined
+          CompType:        Float32
+          Register:        1
+          Mask:            7
+          ExclusiveMask:   0
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            SV_TessFactor
+          Index:           2
+          SystemValue:     FinalQuadEdgeTessfactor
+          CompType:        Float32
+          Register:        2
+          Mask:            8
+          ExclusiveMask:   8
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            BBB
+          Index:           2
+          SystemValue:     Undefined
+          CompType:        Float32
+          Register:        2
+          Mask:            7
+          ExclusiveMask:   0
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            SV_TessFactor
+          Index:           3
+          SystemValue:     FinalQuadEdgeTessfactor
+          CompType:        Float32
+          Register:        3
+          Mask:            8
+          ExclusiveMask:   8
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            SV_InsideTessFactor
+          Index:           0
+          SystemValue:     FinalQuadInsideTessfactor
+          CompType:        Float32
+          Register:        4
+          Mask:            8
+          ExclusiveMask:   0
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            SV_InsideTessFactor
+          Index:           1
+          SystemValue:     FinalQuadInsideTessfactor
+          CompType:        Float32
+          Register:        5
+          Mask:            8
+          ExclusiveMask:   0
+          MinPrecision:    Default
+        - Stream:          0
+          Name:            AAA
+          Index:           0
+          SystemValue:     Undefined
+          CompType:        Float32
+          Register:        6
+          Mask:            15
+          ExclusiveMask:   4
+          MinPrecision:    Default
+...
+
+# CHECK: - Name:            ISG1
+# CHECK-NEXT:   Size:            52
+# CHECK-NEXT:   Signature:
+# CHECK-NEXT:     Parameters:
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            AAA_HSFoo
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:         SystemValue:     Undefined
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        0
+# CHECK-NEXT:         Mask:            7
+# CHECK-NEXT:         ExclusiveMask:   2
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT: - Name:            OSG1
+# CHECK-NEXT:   Size:            52
+# CHECK-NEXT:   Signature:
+# CHECK-NEXT:     Parameters:
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_Position
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:         SystemValue:     Position
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        0
+# CHECK-NEXT:         Mask:            15
+# CHECK-NEXT:         ExclusiveMask:   0
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT: - Name:            PSG1
+# CHECK-NEXT:   Size:            372
+# CHECK-NEXT:   Signature:
+# CHECK-NEXT:     Parameters:
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_TessFactor
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:         SystemValue:     FinalQuadEdgeTessfactor
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        0
+# CHECK-NEXT:         Mask:            8
+# CHECK-NEXT:         ExclusiveMask:   8
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            BBB
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:         SystemValue:     Undefined
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        0
+# CHECK-NEXT:         Mask:            7
+# CHECK-NEXT:         ExclusiveMask:   0
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_TessFactor
+# CHECK-NEXT:         Index:           1
+# CHECK-NEXT:         SystemValue:     FinalQuadEdgeTessfactor
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        1
+# CHECK-NEXT:         Mask:            8
+# CHECK-NEXT:         ExclusiveMask:   8
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            BBB
+# CHECK-NEXT:         Index:           1
+# CHECK-NEXT:         SystemValue:     Undefined
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        1
+# CHECK-NEXT:         Mask:            7
+# CHECK-NEXT:         ExclusiveMask:   0
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_TessFactor
+# CHECK-NEXT:         Index:           2
+# CHECK-NEXT:         SystemValue:     FinalQuadEdgeTessfactor
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        2
+# CHECK-NEXT:         Mask:            8
+# CHECK-NEXT:         ExclusiveMask:   8
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            BBB
+# CHECK-NEXT:         Index:           2
+# CHECK-NEXT:         SystemValue:     Undefined
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        2
+# CHECK-NEXT:         Mask:            7
+# CHECK-NEXT:         ExclusiveMask:   0
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_TessFactor
+# CHECK-NEXT:         Index:           3
+# CHECK-NEXT:         SystemValue:     FinalQuadEdgeTessfactor
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        3
+# CHECK-NEXT:         Mask:            8
+# CHECK-NEXT:         ExclusiveMask:   8
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_InsideTessFactor
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:         SystemValue:     FinalQuadInsideTessfactor
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        4
+# CHECK-NEXT:         Mask:            8
+# CHECK-NEXT:         ExclusiveMask:   0
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            SV_InsideTessFactor
+# CHECK-NEXT:         Index:           1
+# CHECK-NEXT:         SystemValue:     FinalQuadInsideTessfactor
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        5
+# CHECK-NEXT:         Mask:            8
+# CHECK-NEXT:         ExclusiveMask:   0
+# CHECK-NEXT:         MinPrecision:    Default
+# CHECK-NEXT:       - Stream:          0
+# CHECK-NEXT:         Name:            AAA
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:         SystemValue:     Undefined
+# CHECK-NEXT:         CompType:        Float32
+# CHECK-NEXT:         Register:        6
+# CHECK-NEXT:         Mask:            15
+# CHECK-NEXT:         ExclusiveMask:   4
+# CHECK-NEXT:         MinPrecision:    Default
diff --git a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td b/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td
index 2b6b3ed977ebf7..d07ef4e300ee5f 100644
--- a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td
+++ b/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td
@@ -5,6 +5,7 @@
 // CHECK: // PatFrag predicates.
 // CHECK-NEXT: enum {
 // CHECK-NEXT:   GICXXPred_MI_Predicate_and_or_pat = GICXXPred_Invalid + 1,
+// CHECK-NEXT:   GICXXPred_MI_Predicate_mul_pat,
 // CHECK-NEXT:   GICXXPred_MI_Predicate_or_oneuse,
 // CHECK-NEXT:   GICXXPred_MI_Predicate_patfrags_test_pat,
 // CHECK-NEXT:   GICXXPred_MI_Predicate_sub3_pat,
@@ -15,6 +16,8 @@
 // CHECK: bool MyTargetInstructionSelector::testMIPredicate_MI(
 // CHECK:    case GICXXPred_MI_Predicate_and_or_pat: {
 // CHECK:      return doesComplexCheck(MI);
+// CHECK:    case GICXXPred_MI_Predicate_mul_pat: {
+// CHECK:      return doesComplexCheck(MI);
 // CHECK:    case GICXXPred_MI_Predicate_or_oneuse: {
 // CHECK:      return MRI.hasOneNonDBGUse(MI.getOperand(0).getReg());
 // CHECK:    case GICXXPred_MI_Predicate_patfrags_test_pat: {
@@ -49,6 +52,7 @@ def D0 : MyReg<"d0", [S0, S1]>;
 def DRegs : MyClass<32, [i32], (sequence "D%u", 0, 0)>;
 def DOP : RegisterOperand<DRegs>;
 def AND_OR : I<(outs DRegs:$dst), (ins DOP:$src0, DOP:$src1, DOP:$src2), []>;
+def MUL_OR : I<(outs DRegs:$dst), (ins DOP:$src0, DOP:$src1, DOP:$src2), []>;
 
 def or_oneuse : PatFrag<
   (ops node:$x, node:$y),
@@ -69,7 +73,7 @@ def and_or_pat : PatFrag<
   let PredicateCodeUsesOperands = 1;
 }
 
-// CHECK: GIM_Try, /*On fail goto*//*Label 0*/ 99, // Rule ID 6 //
+// CHECK: GIM_Try, /*On fail goto*//*Label 0*/ 99, // Rule ID 7 //
 // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
 // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_AND,
 // CHECK-NEXT: // MIs[0] dst
@@ -135,6 +139,79 @@ def : Pat<
   (AND_OR DOP:$src0, DOP:$src1, DOP:$src2)
 >;
 
+def mul_pat : PatFrag<
+  (ops node:$x, node:$y),
+  (mul node:$x, node:$y), [{ return foo(); }]> {
+  let GISelPredicateCode = [{
+    return doesComplexCheck(MI);
+  }];
+  let PredicateCodeUsesOperands = 1;
+}
+
+// CHECK: GIM_Try, /*On fail goto*//*Label 2*/ 293, // Rule ID 4 //
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/1, /*StoreIdx*/0, // Name : pred:4:x
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_OR,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/2, /*StoreIdx*/1, // Name : pred:4:y
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_mul_pat,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (mul:{ *:[i32] } (or:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1):$pred:4:x, DOP:{ *:[i32] }:$src2:$pred:4:y)<<P:4:Predicate_mul_pat>>  =>  (MUL_OR:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1, DOP:{ *:[i32] }:$src2)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MUL_OR,
+
+// CHECK: GIM_Try, /*On fail goto*//*Label 3*/ 388, // Rule ID 8 //
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/1, /*StoreIdx*/1, // Name : pred:4:y
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_RecordNamedOperand, /*MI*/0, /*Op*/2, /*StoreIdx*/0, // Name : pred:4:x
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_OR,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/Test::DRegsRegClassID,
+// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GICXXPred_MI_Predicate_mul_pat,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (mul:{ *:[i32] } DOP:{ *:[i32] }:$src2:$pred:4:y, (or:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1):$pred:4:x)<<P:4:Predicate_mul_pat>>  =>  (MUL_OR:{ *:[i32] } DOP:{ *:[i32] }:$src0, DOP:{ *:[i32] }:$src1, DOP:{ *:[i32] }:$src2)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MUL_OR,
+
+// Test commutative patterns where named operands in the source pattern are not
+// directly bound to PatFrag's operands.
+def : Pat<
+  (i32 (mul_pat (or DOP:$src0, DOP:$src1), DOP:$src2)),
+  (MUL_OR DOP:$src0, DOP:$src1, DOP:$src2)
+>;
 
 def sub3_pat : PatFrag<
   (ops node:$x, node:$y, node:$z),
@@ -146,7 +223,7 @@ def sub3_pat : PatFrag<
   let PredicateCodeUsesOperands = 1;
 }
 
-// CHECK: GIM_Try, /*On fail goto*//*Label 2*/ 285, // Rule ID 0 //
+// CHECK: GIM_Try, /*On fail goto*//*Label 4*/ 475, // Rule ID 0 //
 // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
 // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SUB,
 // CHECK-NEXT: // MIs[0] dst
@@ -192,16 +269,16 @@ def patfrags_test_pat : PatFrags<
   let PredicateCodeUsesOperands = 1;
 }
 
-// CHECK: GIM_Try, /*On fail goto*//*Label 3*/ 372, // Rule ID 1 //
+// CHECK: GIM_Try, /*On fail goto*//*Label 5*/ 562, // Rule ID 1 //
 // CHECK: // (xor:{ *:[i32] } (add:{ *:[i32] } i32:{ *:[i32] }:$src0:$pred:2:x, i32:{ *:[i32] }:$src1:$pred:2:y), i32:{ *:[i32] }:$src2:$pred:2:z)<<P:2:Predicate_patfrags_test_pat>>  =>  (PATFRAGS:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2)
 
-// CHECK: GIM_Try, /*On fail goto*//*Label 4*/ 459, // Rule ID 2 //
+// CHECK: GIM_Try, /*On fail goto*//*Label 6*/ 649, // Rule ID 2 //
 // CHECK: // (xor:{ *:[i32] } (sub:{ *:[i32] } i32:{ *:[i32] }:$src0:$pred:2:x, i32:{ *:[i32] }:$src1:$pred:2:y), i32:{ *:[i32] }:$src2:$pred:2:z)<<P:2:Predicate_patfrags_test_pat>>  =>  (PATFRAGS:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2)
 
-// CHECK: GIM_Try, /*On fail goto*//*Label 5*/ 546, // Rule ID 4 //
+// CHECK: GIM_Try, /*On fail goto*//*Label 7*/ 736, // Rule ID 5 //
 // CHECK: // (xor:{ *:[i32] } i32:{ *:[i32] }:$src2:$pred:2:z, (add:{ *:[i32] } i32:{ *:[i32] }:$src0:$pred:2:x, i32:{ *:[i32] }:$src1:$pred:2:y))<<P:2:Predicate_patfrags_test_pat>>  =>  (PATFRAGS:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2)
 
-// CHECK: GIM_Try, /*On fail goto*//*Label 6*/ 633, // Rule ID 5 //
+// CHECK: GIM_Try, /*On fail goto*//*Label 8*/ 823, // Rule ID 6 //
 // CHECK: // (xor:{ *:[i32] } i32:{ *:[i32] }:$src2:$pred:2:z, (sub:{ *:[i32] } i32:{ *:[i32] }:$src0:$pred:2:x, i32:{ *:[i32] }:$src1:$pred:2:y))<<P:2:Predicate_patfrags_test_pat>>  =>  (PATFRAGS:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2)
 
 
diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
index 60a5128a889c2b..9a327f087bcd16 100644
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
@@ -14,35 +14,35 @@ define i1 @test_cmpxchg_seq_cst(ptr %addr, i128 %desire, i128 %new) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i128 [[NEW]], 64
 ; CHECK-NEXT:    [[NEW_HI:%.*]] = trunc i128 [[TMP1]] to i64
 ; CHECK-NEXT:    call void @llvm.ppc.sync()
-; CHECK-NEXT:    [[TMP3:%.*]] = call { i64, i64 } @llvm.ppc.cmpxchg.i128(ptr [[ADDR:%.*]], i64 [[CMP_LO]], i64 [[CMP_HI]], i64 [[NEW_LO]], i64 [[NEW_HI]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call { i64, i64 } @llvm.ppc.cmpxchg.i128(ptr [[ADDR:%.*]], i64 [[CMP_LO]], i64 [[CMP_HI]], i64 [[NEW_LO]], i64 [[NEW_HI]])
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
-; CHECK-NEXT:    [[LO:%.*]] = extractvalue { i64, i64 } [[TMP3]], 0
-; CHECK-NEXT:    [[HI:%.*]] = extractvalue { i64, i64 } [[TMP3]], 1
+; CHECK-NEXT:    [[LO:%.*]] = extractvalue { i64, i64 } [[TMP2]], 0
+; CHECK-NEXT:    [[HI:%.*]] = extractvalue { i64, i64 } [[TMP2]], 1
 ; CHECK-NEXT:    [[LO64:%.*]] = zext i64 [[LO]] to i128
 ; CHECK-NEXT:    [[HI64:%.*]] = zext i64 [[HI]] to i128
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i128 [[HI64]], 64
-; CHECK-NEXT:    [[VAL64:%.*]] = or i128 [[LO64]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { i128, i1 } poison, i128 [[VAL64]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i128 [[HI64]], 64
+; CHECK-NEXT:    [[VAL64:%.*]] = or i128 [[LO64]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { i128, i1 } poison, i128 [[VAL64]], 0
 ; CHECK-NEXT:    [[SUCCESS:%.*]] = icmp eq i128 [[DESIRE]], [[VAL64]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { i128, i1 } [[TMP5]], i1 [[SUCCESS]], 1
-; CHECK-NEXT:    [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP6]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { i128, i1 } [[TMP4]], i1 [[SUCCESS]], 1
+; CHECK-NEXT:    [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
 ; CHECK-NEXT:    ret i1 [[SUCC]]
 ;
 ; PWR7-LABEL: @test_cmpxchg_seq_cst(
 ; PWR7-NEXT:  entry:
+; PWR7-NEXT:    [[TMP0:%.*]] = alloca i128, align 8
+; PWR7-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP0]])
+; PWR7-NEXT:    store i128 [[DESIRE:%.*]], ptr [[TMP0]], align 8
 ; PWR7-NEXT:    [[TMP1:%.*]] = alloca i128, align 8
 ; PWR7-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP1]])
-; PWR7-NEXT:    store i128 [[DESIRE:%.*]], ptr [[TMP1]], align 8
-; PWR7-NEXT:    [[TMP3:%.*]] = alloca i128, align 8
-; PWR7-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP3]])
-; PWR7-NEXT:    store i128 [[NEW:%.*]], ptr [[TMP3]], align 8
-; PWR7-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 16, ptr [[ADDR:%.*]], ptr [[TMP1]], ptr [[TMP3]], i32 5, i32 5)
-; PWR7-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP3]])
-; PWR7-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP1]], align 8
+; PWR7-NEXT:    store i128 [[NEW:%.*]], ptr [[TMP1]], align 8
+; PWR7-NEXT:    [[TMP2:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 16, ptr [[ADDR:%.*]], ptr [[TMP0]], ptr [[TMP1]], i32 5, i32 5)
 ; PWR7-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP1]])
-; PWR7-NEXT:    [[TMP7:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP6]], 0
-; PWR7-NEXT:    [[TMP8:%.*]] = insertvalue { i128, i1 } [[TMP7]], i1 [[TMP5]], 1
-; PWR7-NEXT:    [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP8]], 1
+; PWR7-NEXT:    [[TMP3:%.*]] = load i128, ptr [[TMP0]], align 8
+; PWR7-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP0]])
+; PWR7-NEXT:    [[TMP4:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP3]], 0
+; PWR7-NEXT:    [[TMP5:%.*]] = insertvalue { i128, i1 } [[TMP4]], i1 [[TMP2]], 1
+; PWR7-NEXT:    [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
 ; PWR7-NEXT:    ret i1 [[SUCC]]
 ;
 entry:
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
index 6343a0c697fd38..826b9be7a1e5d3 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
@@ -4,12 +4,12 @@
 
 define i256 @atomic_load256_libcall(ptr %ptr) nounwind {
 ; CHECK-LABEL: @atomic_load256_libcall(
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i256, align 8
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP2]])
-; CHECK-NEXT:    call void @__atomic_load(i64 32, ptr [[PTR:%.*]], ptr [[TMP2]], i32 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP2]], align 8
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP2]])
-; CHECK-NEXT:    ret i256 [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca i256, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP1]])
+; CHECK-NEXT:    call void @__atomic_load(i64 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = load i256, ptr [[TMP1]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP1]])
+; CHECK-NEXT:    ret i256 [[TMP2]]
 ;
   %result = load atomic i256, ptr %ptr unordered, align 16
   ret i256 %result
@@ -17,13 +17,13 @@ define i256 @atomic_load256_libcall(ptr %ptr) nounwind {
 
 define i256 @atomic_load256_libcall_as1(ptr addrspace(1) %ptr) nounwind {
 ; CHECK-LABEL: @atomic_load256_libcall_as1(
-; CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
-; CHECK-NEXT:    [[TMP3:%.*]] = alloca i256, align 8
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP3]])
-; CHECK-NEXT:    call void @__atomic_load(i64 32, ptr [[TMP2]], ptr [[TMP3]], i32 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP3]], align 8
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP3]])
-; CHECK-NEXT:    ret i256 [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i256, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP2]])
+; CHECK-NEXT:    call void @__atomic_load(i64 32, ptr [[TMP1]], ptr [[TMP2]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = load i256, ptr [[TMP2]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP2]])
+; CHECK-NEXT:    ret i256 [[TMP3]]
 ;
   %result = load atomic i256, ptr addrspace(1) %ptr unordered, align 16
   ret i256 %result
diff --git a/llvm/test/Transforms/Attributor/nofpclass-canonicalize.ll b/llvm/test/Transforms/Attributor/nofpclass-canonicalize.ll
index 384b8817f26a35..4d3275c3ab1a23 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-canonicalize.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-canonicalize.ll
@@ -5,7 +5,7 @@ declare float @llvm.canonicalize.f32(float)
 
 define float @ret_canonicalize(float %arg0) {
 ; CHECK-LABEL: define nofpclass(snan) float @ret_canonicalize
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -15,7 +15,7 @@ define float @ret_canonicalize(float %arg0) {
 
 define float @ret_canonicalize_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan inf) float @ret_canonicalize_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan inf) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -25,7 +25,7 @@ define float @ret_canonicalize_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_canonicalize_dynamic_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="dynamic,dynamic" {
 ; CHECK-LABEL: define nofpclass(snan inf) float @ret_canonicalize_dynamic_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf) [[ARG0:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -35,7 +35,7 @@ define float @ret_canonicalize_dynamic_denormal(float nofpclass(inf) %arg0) "den
 
 define float @ret_canonicalize_daz_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="dynamic,preserve-sign" {
 ; CHECK-LABEL: define nofpclass(snan inf sub) float @ret_canonicalize_daz_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf sub) [[ARG0:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf sub) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -45,7 +45,7 @@ define float @ret_canonicalize_daz_denormal(float nofpclass(inf) %arg0) "denorma
 
 define float @ret_canonicalize_dapz_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="dynamic,positive-zero" {
 ; CHECK-LABEL: define nofpclass(snan inf nzero sub) float @ret_canonicalize_dapz_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf nzero sub) [[ARG0:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf nzero sub) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -55,7 +55,7 @@ define float @ret_canonicalize_dapz_denormal(float nofpclass(inf) %arg0) "denorm
 
 define float @ret_canonicalize_ftpz_dapz_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="positive-zero,preserve-sign" {
 ; CHECK-LABEL: define nofpclass(snan inf sub) float @ret_canonicalize_ftpz_dapz_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf sub) [[ARG0:%.*]]) #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf sub) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -65,7 +65,7 @@ define float @ret_canonicalize_ftpz_dapz_denormal(float nofpclass(inf) %arg0) "d
 
 define float @ret_canonicalize_ftpz_ieee_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="positive-zero,ieee" {
 ; CHECK-LABEL: define nofpclass(snan inf nzero sub) float @ret_canonicalize_ftpz_ieee_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf nzero sub) [[ARG0:%.*]]) #[[ATTR6:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf nzero sub) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -75,7 +75,7 @@ define float @ret_canonicalize_ftpz_ieee_denormal(float nofpclass(inf) %arg0) "d
 
 define float @ret_canonicalize_ftpz_dynamic_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="positive-zero,dynamic" {
 ; CHECK-LABEL: define nofpclass(snan inf sub) float @ret_canonicalize_ftpz_dynamic_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf sub) [[ARG0:%.*]]) #[[ATTR7:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf sub) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -85,7 +85,7 @@ define float @ret_canonicalize_ftpz_dynamic_denormal(float nofpclass(inf) %arg0)
 
 define float @ret_canonicalize_ftz_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="preserve-sign,dynamic" {
 ; CHECK-LABEL: define nofpclass(snan inf sub) float @ret_canonicalize_ftz_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR8:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf sub) [[ARG0:%.*]]) #[[ATTR8:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf sub) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -95,7 +95,7 @@ define float @ret_canonicalize_ftz_denormal(float nofpclass(inf) %arg0) "denorma
 
 define float @ret_canonicalize_ieee_denormal(float nofpclass(inf) %arg0) "denormal-fp-math"="ieee,ieee" {
 ; CHECK-LABEL: define nofpclass(snan inf) float @ret_canonicalize_ieee_denormal
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR9:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(snan inf) [[ARG0:%.*]]) #[[ATTR9:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf) float @llvm.canonicalize.f32(float [[ARG0]]) #[[ATTR12]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-ceil.ll b/llvm/test/Transforms/Attributor/nofpclass-ceil.ll
index 04771592c0a8d8..5f639b2e269175 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-ceil.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-ceil.ll
@@ -6,7 +6,7 @@ declare ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128)
 
 define float @ret_ceil(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_ceil(float %arg0) {
 
 define float @ret_ceil_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_ceil_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -26,7 +26,7 @@ define float @ret_ceil_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_ceil_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_ceil_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_ceil_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_ceil_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_ceil_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_ceil_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_ceil_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_ceil_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_ceil_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_ceil_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -66,7 +66,7 @@ define float @ret_ceil_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_ceil_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_ceil_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -76,7 +76,7 @@ define float @ret_ceil_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_ceil_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -86,7 +86,7 @@ define float @ret_ceil_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_ceil_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -96,7 +96,7 @@ define float @ret_ceil_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_ceil_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -106,7 +106,7 @@ define float @ret_ceil_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_ceil_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -116,7 +116,7 @@ define float @ret_ceil_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_ceil_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -126,7 +126,7 @@ define float @ret_ceil_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_ceil_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -136,7 +136,7 @@ define float @ret_ceil_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_ceil_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -146,7 +146,7 @@ define float @ret_ceil_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_ceil_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -166,7 +166,7 @@ define float @ret_ceil_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_ceil_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_ceil_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_ceil_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_ceil_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_ceil_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ceil_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -196,7 +196,7 @@ define float @ret_ceil_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_ceil_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_ceil_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -206,7 +206,7 @@ define ppc_fp128 @ret_ceil_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_ceil_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_ceil_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -216,7 +216,7 @@ define ppc_fp128 @ret_ceil_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_ceil_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_ceil_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -226,7 +226,7 @@ define ppc_fp128 @ret_ceil_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 
 define ppc_fp128 @ret_ceil_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_ceil_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -236,7 +236,7 @@ define ppc_fp128 @ret_ceil_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 
 define ppc_fp128 @ret_ceil_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_ceil_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -246,7 +246,7 @@ define ppc_fp128 @ret_ceil_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_ceil_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_ceil_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -256,7 +256,7 @@ define float @ret_ceil_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_ceil_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_ceil_noneg_nonegzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -266,7 +266,7 @@ define float @ret_ceil_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %a
 
 define float @ret_ceil_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_ceil_noneg_nonegzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -276,7 +276,7 @@ define float @ret_ceil_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nze
 
 define float @ret_ceil_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_ceil_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -286,7 +286,7 @@ define float @ret_ceil_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0)
 
 define float @ret_ceil_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_ceil_noneg_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -296,7 +296,7 @@ define float @ret_ceil_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero n
 
 define float @ret_ceil_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_ceil_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -306,7 +306,7 @@ define float @ret_ceil_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_ceil_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_ceil_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -316,7 +316,7 @@ define float @ret_ceil_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg
 
 define float @ret_ceil_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_ceil_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -326,7 +326,7 @@ define float @ret_ceil_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero
 
 define float @ret_ceil_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_ceil_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -336,7 +336,7 @@ define float @ret_ceil_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0)
 
 define float @ret_ceil_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_ceil_nopos_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.ceil.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-copysign.ll b/llvm/test/Transforms/Attributor/nofpclass-copysign.ll
index 231f0a42d05658..b893bede188942 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-copysign.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-copysign.ll
@@ -16,7 +16,7 @@ define float @ret_copysign(float %arg0, float %arg1) {
 
 define float @ret_copysign_fabs_rhs(float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_copysign_fabs_rhs
-; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS_ARG1:%.*]] = call float @llvm.fabs.f32(float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.copysign.f32(float [[ARG0]], float [[FABS_ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -28,7 +28,7 @@ define float @ret_copysign_fabs_rhs(float %arg0, float %arg1) {
 
 define float @ret_copysign_nnan_lhs_fabs_rhs(float nofpclass(nan) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_copysign_nnan_lhs_fabs_rhs
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS_ARG1:%.*]] = call float @llvm.fabs.f32(float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.copysign.f32(float [[ARG0]], float [[FABS_ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -41,7 +41,7 @@ define float @ret_copysign_nnan_lhs_fabs_rhs(float nofpclass(nan) %arg0, float %
 
 define float @ret_copysign_lhs_fabs_nnan_rhs(float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_copysign_lhs_fabs_nnan_rhs
-; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS_ARG1:%.*]] = call nnan float @llvm.fabs.f32(float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.copysign.f32(float [[ARG0]], float [[FABS_ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -74,7 +74,7 @@ define float @ret_copysign_noneg_rhs(float %arg0, float nofpclass(ninf nnorm nsu
 
 define float @ret_copysign_noneg_nonan_rhs(float %arg0, float nofpclass(ninf nnorm nsub nzero nan) %arg1) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_copysign_noneg_nonan_rhs
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -104,7 +104,7 @@ define float @ret_copysign_nopos_rhs(float %arg0, float nofpclass(pinf pnorm psu
 
 define float @ret_copysign_nonan_lhs(float nofpclass(nan) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_copysign_nonan_lhs
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -124,7 +124,7 @@ define float @ret_copysign_nonan_rhs(float %arg0, float nofpclass(nan) %arg1) {
 
 define float @ret_copysign_noneg_nonan_lhs(float nofpclass(ninf nnorm nsub nzero nan) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_copysign_noneg_nonan_lhs
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -134,7 +134,7 @@ define float @ret_copysign_noneg_nonan_lhs(float nofpclass(ninf nnorm nsub nzero
 
 define float @ret_copysign_nopos_nonan_lhs(float nofpclass(pinf pnorm psub pzero nan) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_copysign_nopos_nonan_lhs
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -164,7 +164,7 @@ define float @ret_copysign_mixed_lhs1(float nofpclass(pinf nnorm psub pzero) %ar
 
 define float @ret_copysign_mixed_lhs2(float nofpclass(ninf pnorm psub nzero qnan) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(qnan) float @ret_copysign_mixed_lhs2
-; CHECK-SAME: (float nofpclass(qnan ninf nzero psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan ninf nzero psub pnorm) [[ARG0:%.*]], float nofpclass(qnan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(qnan) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -194,7 +194,7 @@ define float @ret_copysign_nopinf_lhs(float nofpclass(pinf) %arg0, float %arg1)
 
 define float @ret_copysign_noinf_lhs(float nofpclass(inf) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_copysign_noinf_lhs
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -224,7 +224,7 @@ define float @ret_copysign_nopzero_lhs(float nofpclass(pzero) %arg0, float %arg1
 
 define float @ret_copysign_nozero_lhs(float nofpclass(zero) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(zero) float @ret_copysign_nozero_lhs
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -254,7 +254,7 @@ define float @ret_copysign_nopsub_lhs(float nofpclass(psub) %arg0, float %arg1)
 
 define float @ret_copysign_nosub_lhs(float nofpclass(sub) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_copysign_nosub_lhs
-; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]], float nofpclass(sub) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -284,7 +284,7 @@ define float @ret_copysign_nopnorm_lhs(float nofpclass(pnorm) %arg0, float %arg1
 
 define float @ret_copysign_nonorm_lhs(float nofpclass(norm) %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(norm) float @ret_copysign_nonorm_lhs
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]], float nofpclass(norm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(norm) float @llvm.copysign.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-exp.ll b/llvm/test/Transforms/Attributor/nofpclass-exp.ll
index e0dd6f38367a8d..96c47da2a40c02 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-exp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-exp.ll
@@ -7,7 +7,7 @@ declare float @llvm.exp10.f32(float)
 
 define float @ret_exp(float %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -17,7 +17,7 @@ define float @ret_exp(float %arg0) {
 
 define float @ret_exp_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -27,7 +27,7 @@ define float @ret_exp_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_exp_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -37,7 +37,7 @@ define float @ret_exp_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_exp_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -47,7 +47,7 @@ define float @ret_exp_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_exp_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -57,7 +57,7 @@ define float @ret_exp_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_exp_nonan_noinf(float nofpclass(nan inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp_nonan_noinf
-; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -67,7 +67,7 @@ define float @ret_exp_nonan_noinf(float nofpclass(nan inf) %arg0) {
 
 define float @ret_exp_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp_nonan_noinf_nozero
-; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -77,7 +77,7 @@ define float @ret_exp_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) {
 
 define float @ret_exp_noinf_nozero(float nofpclass(inf zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -87,7 +87,7 @@ define float @ret_exp_noinf_nozero(float nofpclass(inf zero) %arg0) {
 
 define float @ret_exp_noinf_nonegzero(float nofpclass(inf nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -110,7 +110,7 @@ define float @ret_exp_positive_source(i32 %arg) {
 ; Could produce a nan because we don't know if the multiply is negative.
 define float @ret_exp_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(nan) %arg1) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp_unknown_sign
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[UNKNOWN_SIGN_NOT_NAN:%.*]] = fmul nnan float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp.f32(float [[UNKNOWN_SIGN_NOT_NAN]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -122,7 +122,7 @@ define float @ret_exp_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(n
 
 define float @ret_exp2(float %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp2
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -132,7 +132,7 @@ define float @ret_exp2(float %arg0) {
 
 define float @ret_exp2_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp2_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -142,7 +142,7 @@ define float @ret_exp2_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_exp2_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp2_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -152,7 +152,7 @@ define float @ret_exp2_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_exp2_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp2_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -162,7 +162,7 @@ define float @ret_exp2_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_exp2_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp2_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -172,7 +172,7 @@ define float @ret_exp2_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_exp2_nonan_noinf(float nofpclass(nan inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp2_nonan_noinf
-; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -182,7 +182,7 @@ define float @ret_exp2_nonan_noinf(float nofpclass(nan inf) %arg0) {
 
 define float @ret_exp2_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp2_nonan_noinf_nozero
-; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -192,7 +192,7 @@ define float @ret_exp2_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) {
 
 define float @ret_exp2_noinf_nozero(float nofpclass(inf zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp2_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -202,7 +202,7 @@ define float @ret_exp2_noinf_nozero(float nofpclass(inf zero) %arg0) {
 
 define float @ret_exp2_noinf_nonegzero(float nofpclass(inf nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp2_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -225,7 +225,7 @@ define float @ret_exp2_positive_source(i32 %arg) {
 ; Could produce a nan because we don't know if the multiply is negative.
 define float @ret_exp2_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(nan) %arg1) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp2_unknown_sign
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[UNKNOWN_SIGN_NOT_NAN:%.*]] = fmul nnan float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[UNKNOWN_SIGN_NOT_NAN]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -237,7 +237,7 @@ define float @ret_exp2_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(
 
 define float @ret_exp10(float %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp10
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -247,7 +247,7 @@ define float @ret_exp10(float %arg0) {
 
 define float @ret_exp10_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp10_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -257,7 +257,7 @@ define float @ret_exp10_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_exp10_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp10_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -267,7 +267,7 @@ define float @ret_exp10_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_exp10_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp10_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -277,7 +277,7 @@ define float @ret_exp10_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_exp10_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp10_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -287,7 +287,7 @@ define float @ret_exp10_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_exp10_nonan_noinf(float nofpclass(nan inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp10_nonan_noinf
-; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -297,7 +297,7 @@ define float @ret_exp10_nonan_noinf(float nofpclass(nan inf) %arg0) {
 
 define float @ret_exp10_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp10_nonan_noinf_nozero
-; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -307,7 +307,7 @@ define float @ret_exp10_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0)
 
 define float @ret_exp10_noinf_nozero(float nofpclass(inf zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp10_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -317,7 +317,7 @@ define float @ret_exp10_noinf_nozero(float nofpclass(inf zero) %arg0) {
 
 define float @ret_exp10_noinf_nonegzero(float nofpclass(inf nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_exp10_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -340,7 +340,7 @@ define float @ret_exp10_positive_source(i32 %arg) {
 ; Could produce a nan because we don't know if the multiply is negative.
 define float @ret_exp10_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(nan) %arg1) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_exp10_unknown_sign
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[UNKNOWN_SIGN_NOT_NAN:%.*]] = fmul nnan float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp10.f32(float [[UNKNOWN_SIGN_NOT_NAN]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass-fdiv.ll b/llvm/test/Transforms/Attributor/nofpclass-fdiv.ll
index 82a10f60e838e1..c1c980df4bd9e8 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-fdiv.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-fdiv.ll
@@ -357,7 +357,7 @@ define float @ret_fdiv_daz_noinf__nozero_nosub(float nofpclass(inf) %arg0, float
 
 define float @ret_fdiv_same_operands(float %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @ret_fdiv_same_operands
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FDIV:%.*]] = fdiv float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FDIV]]
 ;
@@ -367,7 +367,7 @@ define float @ret_fdiv_same_operands(float %arg) #0 {
 
 define float @ret_fdiv_same_operands_nosnan(float nofpclass(snan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @ret_fdiv_same_operands_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(snan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FDIV:%.*]] = fdiv float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FDIV]]
 ;
@@ -377,7 +377,7 @@ define float @ret_fdiv_same_operands_nosnan(float nofpclass(snan) %arg) #0 {
 
 define float @ret_fdiv_same_operands_noqnan(float nofpclass(qnan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @ret_fdiv_same_operands_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(qnan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FDIV:%.*]] = fdiv float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FDIV]]
 ;
@@ -387,7 +387,7 @@ define float @ret_fdiv_same_operands_noqnan(float nofpclass(qnan) %arg) #0 {
 
 define float @ret_fdiv_same_operands_nonan(float nofpclass(nan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @ret_fdiv_same_operands_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FDIV:%.*]] = fdiv float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FDIV]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-floor.ll b/llvm/test/Transforms/Attributor/nofpclass-floor.ll
index 0acece033ab4ce..5b5b1c538b421a 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-floor.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-floor.ll
@@ -6,7 +6,7 @@ declare ppc_fp128 @llvm.floor.ppcf128(ppc_fp128)
 
 define float @ret_floor(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_floor(float %arg0) {
 
 define float @ret_floor_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_floor_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -26,7 +26,7 @@ define float @ret_floor_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_floor_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_floor_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_floor_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_floor_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_floor_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_floor_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_floor_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_floor_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_floor_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_floor_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -66,7 +66,7 @@ define float @ret_floor_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_floor_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_floor_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -76,7 +76,7 @@ define float @ret_floor_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_floor_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -86,7 +86,7 @@ define float @ret_floor_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_floor_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -96,7 +96,7 @@ define float @ret_floor_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_floor_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -106,7 +106,7 @@ define float @ret_floor_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_floor_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -116,7 +116,7 @@ define float @ret_floor_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_floor_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -126,7 +126,7 @@ define float @ret_floor_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_floor_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -136,7 +136,7 @@ define float @ret_floor_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_floor_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -146,7 +146,7 @@ define float @ret_floor_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_floor_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -166,7 +166,7 @@ define float @ret_floor_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_floor_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_floor_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_floor_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_floor_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_floor_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -196,7 +196,7 @@ define float @ret_floor_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_floor_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_floor_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -206,7 +206,7 @@ define ppc_fp128 @ret_floor_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_floor_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_floor_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -216,7 +216,7 @@ define ppc_fp128 @ret_floor_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_floor_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_floor_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -226,7 +226,7 @@ define ppc_fp128 @ret_floor_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 
 define ppc_fp128 @ret_floor_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_floor_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -236,7 +236,7 @@ define ppc_fp128 @ret_floor_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 
 define ppc_fp128 @ret_floor_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_floor_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -246,7 +246,7 @@ define ppc_fp128 @ret_floor_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_floor_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_floor_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -256,7 +256,7 @@ define float @ret_floor_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_floor_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_floor_noneg_nonegzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -266,7 +266,7 @@ define float @ret_floor_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %
 
 define float @ret_floor_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_floor_noneg_nonegzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -276,7 +276,7 @@ define float @ret_floor_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nz
 
 define float @ret_floor_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_floor_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -286,7 +286,7 @@ define float @ret_floor_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0
 
 define float @ret_floor_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_floor_noneg_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -296,7 +296,7 @@ define float @ret_floor_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero
 
 define float @ret_floor_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_floor_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -306,7 +306,7 @@ define float @ret_floor_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_floor_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_floor_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -316,7 +316,7 @@ define float @ret_floor_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %ar
 
 define float @ret_floor_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_floor_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -326,7 +326,7 @@ define float @ret_floor_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzer
 
 define float @ret_floor_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_floor_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -336,7 +336,7 @@ define float @ret_floor_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0
 
 define float @ret_floor_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_floor_nopos_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -346,7 +346,7 @@ define float @ret_floor_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero
 
 define float @ret_floor_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nopzero_nopnorm
-; CHECK-SAME: (float nofpclass(pzero pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -356,7 +356,7 @@ define float @ret_floor_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 
 define float @ret_floor_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nonzero_nonnorm
-; CHECK-SAME: (float nofpclass(nzero nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -366,7 +366,7 @@ define float @ret_floor_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 
 define float @ret_floor_nozero_nonorm(float nofpclass(zero norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_floor_nozero_nonorm
-; CHECK-SAME: (float nofpclass(zero norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.floor.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-fma.ll b/llvm/test/Transforms/Attributor/nofpclass-fma.ll
index 07a01ae81f3b68..f96fc043016bce 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-fma.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-fma.ll
@@ -6,7 +6,7 @@ declare float @llvm.fmuladd.f32(float, float, float)
 
 define float @ret_fma_same_mul_arg(float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nzero) float @ret_fma_same_mul_arg
-; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]], float nofpclass(nzero) [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nzero) float @llvm.fma.f32(float [[ARG0]], float [[ARG0]], float [[ARG1]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_fma_same_mul_arg(float %arg0, float %arg1) {
 
 define float @ret_fma_same_mul_arg_positive_addend(float %arg0, float nofpclass(ninf nsub nnorm) %arg1) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_fma_same_mul_arg_positive_addend
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fma.f32(float [[ARG0]], float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_fma_different_mul_arg_positive_addend(float %arg0, float %arg1
 
 define float @ret_fmuladd_different_same_arg_positive_addend(float %arg0, float nofpclass(ninf nsub nnorm) %arg1) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_fmuladd_different_same_arg_positive_addend
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fmuladd.f32(float [[ARG0]], float [[ARG0]], float [[ARG1]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-fpext.ll b/llvm/test/Transforms/Attributor/nofpclass-fpext.ll
index d2cff4bafc1d4d..0ba114117ceec6 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-fpext.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-fpext.ll
@@ -3,7 +3,7 @@
 
 define double @ret_fpext_f32_to_f64(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) double @ret_fpext_f32_to_f64
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -13,7 +13,7 @@ define double @ret_fpext_f32_to_f64(float %arg0) {
 
 define <2 x double> @ret_fpext_v2f32_to_v2f64(<2 x float> %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) <2 x double> @ret_fpext_v2f32_to_v2f64
-; CHECK-SAME: (<2 x float> [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (<2 x float> nofpclass(sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext <2 x float> [[ARG0]] to <2 x double>
 ; CHECK-NEXT:    ret <2 x double> [[EXT]]
 ;
@@ -23,7 +23,7 @@ define <2 x double> @ret_fpext_v2f32_to_v2f64(<2 x float> %arg0) {
 
 define double @ret_fpext_f32_to_f64_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) double @ret_fpext_f32_to_f64_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -33,7 +33,7 @@ define double @ret_fpext_f32_to_f64_nonan(float nofpclass(nan) %arg0) {
 
 define <2 x double> @ret_fpext_v2f32_to_v2f64_nonan(<2 x float> nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) <2 x double> @ret_fpext_v2f32_to_v2f64_nonan
-; CHECK-SAME: (<2 x float> nofpclass(nan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (<2 x float> nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext <2 x float> [[ARG0]] to <2 x double>
 ; CHECK-NEXT:    ret <2 x double> [[EXT]]
 ;
@@ -43,7 +43,7 @@ define <2 x double> @ret_fpext_v2f32_to_v2f64_nonan(<2 x float> nofpclass(nan) %
 
 define double @ret_fpext_f32_to_f64_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(qnan sub) double @ret_fpext_f32_to_f64_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -53,7 +53,7 @@ define double @ret_fpext_f32_to_f64_noqnan(float nofpclass(qnan) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) double @ret_fpext_f32_to_f64_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -63,7 +63,7 @@ define double @ret_fpext_f32_to_f64_nosnan(float nofpclass(snan) %arg0) {
 
 define double @ret_fpext_f32_to_f64_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) double @ret_fpext_f32_to_f64_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -73,7 +73,7 @@ define double @ret_fpext_f32_to_f64_noinf(float nofpclass(inf) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) double @ret_fpext_f32_to_f64_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -83,7 +83,7 @@ define double @ret_fpext_f32_to_f64_nopinf(float nofpclass(pinf) %arg0) {
 
 define double @ret_fpext_f32_to_f64_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) double @ret_fpext_f32_to_f64_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -93,7 +93,7 @@ define double @ret_fpext_f32_to_f64_noninf(float nofpclass(ninf) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(zero sub) double @ret_fpext_f32_to_f64_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -103,7 +103,7 @@ define double @ret_fpext_f32_to_f64_nozero(float nofpclass(zero) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pzero sub) double @ret_fpext_f32_to_f64_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -113,7 +113,7 @@ define double @ret_fpext_f32_to_f64_nopzero(float nofpclass(pzero) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(nzero sub) double @ret_fpext_f32_to_f64_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -123,7 +123,7 @@ define double @ret_fpext_f32_to_f64_nonzero(float nofpclass(nzero) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nonan_noinf(float nofpclass(nan inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan inf sub) double @ret_fpext_f32_to_f64_nonan_noinf
-; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan inf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -143,7 +143,7 @@ define double @ret_fpext_f32_to_f64_nosub(float nofpclass(sub) %arg0) {
 
 define double @ret_fpext_f32_to_f64_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_f32_to_f64_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -153,7 +153,7 @@ define double @ret_fpext_f32_to_f64_nonorm(float nofpclass(norm) %arg0) {
 
 define double @ret_fpext_f32_to_f64_negonly_zero(float nofpclass(pinf pnorm psub pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) double @ret_fpext_f32_to_f64_negonly_zero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -163,7 +163,7 @@ define double @ret_fpext_f32_to_f64_negonly_zero(float nofpclass(pinf pnorm psub
 
 define double @ret_fpext_f32_to_f64_negonly(float nofpclass(pinf pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub pnorm) double @ret_fpext_f32_to_f64_negonly
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -173,7 +173,7 @@ define double @ret_fpext_f32_to_f64_negonly(float nofpclass(pinf pnorm psub) %ar
 
 define double @ret_fpext_f32_to_f64_negonly_ord(float nofpclass(pinf pnorm psub pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) double @ret_fpext_f32_to_f64_negonly_ord
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -183,7 +183,7 @@ define double @ret_fpext_f32_to_f64_negonly_ord(float nofpclass(pinf pnorm psub
 
 define double @ret_fpext_f32_to_f64_posonly_zero(float nofpclass(ninf nnorm nsub nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) double @ret_fpext_f32_to_f64_posonly_zero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -193,7 +193,7 @@ define double @ret_fpext_f32_to_f64_posonly_zero(float nofpclass(ninf nnorm nsub
 
 define double @ret_fpext_f32_to_f64_posonly(float nofpclass(ninf nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub nnorm) double @ret_fpext_f32_to_f64_posonly
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -203,7 +203,7 @@ define double @ret_fpext_f32_to_f64_posonly(float nofpclass(ninf nnorm nsub) %ar
 
 define double @ret_fpext_f32_to_f64_posonly_nan(float nofpclass(ninf nnorm nsub nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) double @ret_fpext_f32_to_f64_posonly_nan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -363,7 +363,7 @@ define float @ret_fpext_bf16_f32_nonorm(bfloat nofpclass(norm) %arg0) {
 
 define double @ret_fpext_bf16_f64(bfloat %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) double @ret_fpext_bf16_f64
-; CHECK-SAME: (bfloat [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -373,7 +373,7 @@ define double @ret_fpext_bf16_f64(bfloat %arg0) {
 
 define double @ret_fpext_bf16_f64_nonan(bfloat nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) double @ret_fpext_bf16_f64_nonan
-; CHECK-SAME: (bfloat nofpclass(nan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -383,7 +383,7 @@ define double @ret_fpext_bf16_f64_nonan(bfloat nofpclass(nan) %arg0) {
 
 define double @ret_fpext_bf16_f64_noqnan(bfloat nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(qnan sub) double @ret_fpext_bf16_f64_noqnan
-; CHECK-SAME: (bfloat nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -393,7 +393,7 @@ define double @ret_fpext_bf16_f64_noqnan(bfloat nofpclass(qnan) %arg0) {
 
 define double @ret_fpext_bf16_f64_nosnan(bfloat nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) double @ret_fpext_bf16_f64_nosnan
-; CHECK-SAME: (bfloat nofpclass(snan) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -403,7 +403,7 @@ define double @ret_fpext_bf16_f64_nosnan(bfloat nofpclass(snan) %arg0) {
 
 define double @ret_fpext_bf16_f64_noinf(bfloat nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) double @ret_fpext_bf16_f64_noinf
-; CHECK-SAME: (bfloat nofpclass(inf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -413,7 +413,7 @@ define double @ret_fpext_bf16_f64_noinf(bfloat nofpclass(inf) %arg0) {
 
 define double @ret_fpext_bf16_f64_nopinf(bfloat nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) double @ret_fpext_bf16_f64_nopinf
-; CHECK-SAME: (bfloat nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -423,7 +423,7 @@ define double @ret_fpext_bf16_f64_nopinf(bfloat nofpclass(pinf) %arg0) {
 
 define double @ret_fpext_bf16_f64_noninf(bfloat nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) double @ret_fpext_bf16_f64_noninf
-; CHECK-SAME: (bfloat nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -433,7 +433,7 @@ define double @ret_fpext_bf16_f64_noninf(bfloat nofpclass(ninf) %arg0) {
 
 define double @ret_fpext_bf16_f64_nozero(bfloat nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(zero sub) double @ret_fpext_bf16_f64_nozero
-; CHECK-SAME: (bfloat nofpclass(zero) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -443,7 +443,7 @@ define double @ret_fpext_bf16_f64_nozero(bfloat nofpclass(zero) %arg0) {
 
 define double @ret_fpext_bf16_f64_nopzero(bfloat nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pzero sub) double @ret_fpext_bf16_f64_nopzero
-; CHECK-SAME: (bfloat nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -453,7 +453,7 @@ define double @ret_fpext_bf16_f64_nopzero(bfloat nofpclass(pzero) %arg0) {
 
 define double @ret_fpext_bf16_f64_nonzero(bfloat nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(nzero sub) double @ret_fpext_bf16_f64_nonzero
-; CHECK-SAME: (bfloat nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -463,7 +463,7 @@ define double @ret_fpext_bf16_f64_nonzero(bfloat nofpclass(nzero) %arg0) {
 
 define double @ret_fpext_bf16_f64_nonan_noinf(bfloat nofpclass(nan inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan inf sub) double @ret_fpext_bf16_f64_nonan_noinf
-; CHECK-SAME: (bfloat nofpclass(nan inf) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(nan inf sub) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
@@ -483,7 +483,7 @@ define double @ret_fpext_bf16_f64_nosub(bfloat nofpclass(sub) %arg0) {
 
 define double @ret_fpext_bf16_f64_nonorm(bfloat nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub norm) double @ret_fpext_bf16_f64_nonorm
-; CHECK-SAME: (bfloat nofpclass(norm) [[ARG0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (bfloat nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext bfloat [[ARG0]] to double
 ; CHECK-NEXT:    ret double [[EXT]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-frem.ll b/llvm/test/Transforms/Attributor/nofpclass-frem.ll
index 77b697e4dca5ce..a6708e44ff7d03 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-frem.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-frem.ll
@@ -357,7 +357,7 @@ define float @ret_frem_daz_noinf__nozero_nosub(float nofpclass(inf) %arg0, float
 
 define float @ret_frem_same_operands(float %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf sub norm) float @ret_frem_same_operands
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(inf sub norm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
@@ -367,7 +367,7 @@ define float @ret_frem_same_operands(float %arg) #0 {
 
 define float @ret_frem_same_operands_nosnan(float nofpclass(snan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf sub norm) float @ret_frem_same_operands_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(snan inf sub norm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
@@ -377,7 +377,7 @@ define float @ret_frem_same_operands_nosnan(float nofpclass(snan) %arg) #0 {
 
 define float @ret_frem_same_operands_noqnan(float nofpclass(qnan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf sub norm) float @ret_frem_same_operands_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(qnan inf sub norm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
@@ -387,7 +387,7 @@ define float @ret_frem_same_operands_noqnan(float nofpclass(qnan) %arg) #0 {
 
 define float @ret_frem_same_operands_nonan(float nofpclass(nan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(inf sub norm) float @ret_frem_same_operands_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan inf sub norm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
@@ -397,7 +397,7 @@ define float @ret_frem_same_operands_nonan(float nofpclass(nan) %arg) #0 {
 
 define float @ret_frem_no_neg_lhs(float nofpclass(ninf nsub nnorm) %arg0, float %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_frem_no_neg_lhs
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
@@ -467,7 +467,7 @@ define float @ret_frem_no_neg_no_zero_rhs(float nofpclass(ninf nsub nnorm nzero)
 
 define float @ret_frem_no_pos_lhs(float nofpclass(pinf psub pnorm pzero) %arg0, float %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_frem_no_pos_lhs
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]], float nofpclass(pinf pzero psub pnorm) [[ARG1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
@@ -487,7 +487,7 @@ define float @ret_frem_no_pos_rhs(float %arg0, float nofpclass(pinf psub pnorm p
 
 define float @ret_frem_no_pos_zero_lhs(float nofpclass(pinf psub pnorm) %arg0, float %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_frem_no_pos_zero_lhs
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[FREM]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-frexp.ll b/llvm/test/Transforms/Attributor/nofpclass-frexp.ll
index a8794eb3098ac4..5294b79c5559e0 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-frexp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-frexp.ll
@@ -27,7 +27,7 @@ define { float, i32 } @ret_frexp_f32_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_frexp_f32_0_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_frexp_f32_0_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -39,7 +39,7 @@ define float @ret_frexp_f32_0_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_frexp_f32_0_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_frexp_f32_0_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -51,7 +51,7 @@ define float @ret_frexp_f32_0_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_frexp_f32_0_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -75,7 +75,7 @@ define i32 @ret_frexp_f32_1_nonan(float nofpclass(nan) %arg0) {
 
 define <2 x float> @ret_frexp_v2f32_0_nonan(<2 x float> nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) <2 x float> @ret_frexp_v2f32_0_nonan
-; CHECK-SAME: (<2 x float> nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (<2 x float> nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { <2 x float>, <2 x i32> } [[CALL]], 0
 ; CHECK-NEXT:    ret <2 x float> [[CALL_0]]
@@ -99,7 +99,7 @@ define <2 x i32> @ret_frexp_v2f32_1_nonan(<2 x float> nofpclass(nan) %arg0) {
 
 define float @ret_frexp_v4f32_0_nonan_elt1(<4 x float> nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_frexp_v4f32_0_nonan_elt1
-; CHECK-SAME: (<4 x float> nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (<4 x float> nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[CALL]], 0
 ; CHECK-NEXT:    [[ELT_2:%.*]] = extractelement <4 x float> [[CALL_0]], i32 2
@@ -113,7 +113,7 @@ define float @ret_frexp_v4f32_0_nonan_elt1(<4 x float> nofpclass(nan) %arg0) {
 
 define float @ret_frexp_f32_0_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_frexp_f32_0_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -125,7 +125,7 @@ define float @ret_frexp_f32_0_nopos_nopzero(float nofpclass(pinf psub pnorm pzer
 
 define float @ret_frexp_f32_0_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_frexp_f32_0_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -137,7 +137,7 @@ define float @ret_frexp_f32_0_nopos_nopzero_nonan(float nofpclass(pinf psub pnor
 
 define float @ret_frexp_f32_0_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_frexp_f32_0_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -149,7 +149,7 @@ define float @ret_frexp_f32_0_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_frexp_f32_0_nopos_nonan(float nofpclass(pinf psub pnorm nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf sub) float @ret_frexp_f32_0_nopos_nonan
-; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -161,7 +161,7 @@ define float @ret_frexp_f32_0_nopos_nonan(float nofpclass(pinf psub pnorm nan) %
 
 define float @ret_frexp_f32_0_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf zero sub pnorm) float @ret_frexp_f32_0_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -173,7 +173,7 @@ define float @ret_frexp_f32_0_nopos_nozero(float nofpclass(pinf psub pnorm zero)
 
 define float @ret_frexp_f32_0_noneg_nonzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_frexp_f32_0_noneg_nonzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -185,7 +185,7 @@ define float @ret_frexp_f32_0_noneg_nonzero(float nofpclass(ninf nsub nnorm nzer
 
 define float @ret_frexp_f32_0_noneg_nonzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_frexp_f32_0_noneg_nonzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -197,7 +197,7 @@ define float @ret_frexp_f32_0_noneg_nonzero_nonan(float nofpclass(ninf nsub nnor
 
 define float @ret_frexp_f32_0_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_frexp_f32_0_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -209,7 +209,7 @@ define float @ret_frexp_f32_0_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_frexp_f32_0_noneg_nonan(float nofpclass(ninf nsub nnorm nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf sub) float @ret_frexp_f32_0_noneg_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -221,7 +221,7 @@ define float @ret_frexp_f32_0_noneg_nonan(float nofpclass(ninf nsub nnorm nan) %
 
 define float @ret_frexp_f32_0_noneg_nozero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_frexp_f32_0_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -233,7 +233,7 @@ define float @ret_frexp_f32_0_noneg_nozero(float nofpclass(ninf nsub nnorm nzero
 
 define float @ret_frexp_f32_0_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pzero sub) float @ret_frexp_f32_0_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -245,7 +245,7 @@ define float @ret_frexp_f32_0_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_frexp_f32_0_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(nzero sub) float @ret_frexp_f32_0_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -257,7 +257,7 @@ define float @ret_frexp_f32_0_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_frexp_f32_0_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(zero sub) float @ret_frexp_f32_0_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -269,7 +269,7 @@ define float @ret_frexp_f32_0_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_frexp_f32_0_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_frexp_f32_0_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -281,7 +281,7 @@ define float @ret_frexp_f32_0_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_frexp_f32_0_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_frexp_f32_0_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -293,7 +293,7 @@ define float @ret_frexp_f32_0_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_frexp_f32_0_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_frexp_f32_0_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -305,7 +305,7 @@ define float @ret_frexp_f32_0_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_frexp_f32_0_nozero_nonan(float nofpclass(zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan zero sub) float @ret_frexp_f32_0_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -317,7 +317,7 @@ define float @ret_frexp_f32_0_nozero_nonan(float nofpclass(zero nan) %arg0) {
 
 define float @ret_frexp_f32_0_nozero_noinf(float nofpclass(zero inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf zero sub) float @ret_frexp_f32_0_nozero_noinf
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -329,7 +329,7 @@ define float @ret_frexp_f32_0_nozero_noinf(float nofpclass(zero inf) %arg0) {
 
 define float @ret_frexp_f32_0_nozero_nonan_noinf(float nofpclass(zero nan inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan inf zero sub) float @ret_frexp_f32_0_nozero_nonan_noinf
-; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -341,7 +341,7 @@ define float @ret_frexp_f32_0_nozero_nonan_noinf(float nofpclass(zero nan inf) %
 
 define float @ret_frexp_f32_0_nonzero_ftz_daz(float nofpclass(nzero) %arg0) #1 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nonzero_ftz_daz
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -353,7 +353,7 @@ define float @ret_frexp_f32_0_nonzero_ftz_daz(float nofpclass(nzero) %arg0) #1 {
 
 define float @ret_frexp_f32_0_nonzero_ftpz_dapz(float nofpclass(nzero) %arg0) #2 {
 ; CHECK-LABEL: define nofpclass(nzero sub) float @ret_frexp_f32_0_nonzero_ftpz_dapz
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -365,7 +365,7 @@ define float @ret_frexp_f32_0_nonzero_ftpz_dapz(float nofpclass(nzero) %arg0) #2
 
 define float @ret_frexp_f32_0_nonzero_dynamic_dynamic(float nofpclass(nzero) %arg0) #3 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nonzero_dynamic_dynamic
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -377,7 +377,7 @@ define float @ret_frexp_f32_0_nonzero_dynamic_dynamic(float nofpclass(nzero) %ar
 
 define float @ret_frexp_f32_0_nonzero_ieee_daz(float nofpclass(nzero) %arg0) #4 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nonzero_ieee_daz
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -389,7 +389,7 @@ define float @ret_frexp_f32_0_nonzero_ieee_daz(float nofpclass(nzero) %arg0) #4
 
 define float @ret_frexp_f32_0_nonzero_daz_ieee(float nofpclass(nzero) %arg0) #5 {
 ; CHECK-LABEL: define nofpclass(nzero sub) float @ret_frexp_f32_0_nonzero_daz_ieee
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR6:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -401,7 +401,7 @@ define float @ret_frexp_f32_0_nonzero_daz_ieee(float nofpclass(nzero) %arg0) #5
 
 define float @ret_frexp_f32_0_nopzero_ftz_daz(float nofpclass(pzero) %arg0) #1 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nopzero_ftz_daz
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -413,7 +413,7 @@ define float @ret_frexp_f32_0_nopzero_ftz_daz(float nofpclass(pzero) %arg0) #1 {
 
 define float @ret_frexp_f32_0_nopzero_ftpz_dapz(float nofpclass(pzero) %arg0) #2 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nopzero_ftpz_dapz
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -425,7 +425,7 @@ define float @ret_frexp_f32_0_nopzero_ftpz_dapz(float nofpclass(pzero) %arg0) #2
 
 define float @ret_frexp_f32_0_nopzero_dynamic_dynamic(float nofpclass(pzero) %arg0) #3 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nopzero_dynamic_dynamic
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -437,7 +437,7 @@ define float @ret_frexp_f32_0_nopzero_dynamic_dynamic(float nofpclass(pzero) %ar
 
 define float @ret_frexp_f32_0_nopzero_ieee_daz(float nofpclass(pzero) %arg0) #4 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_frexp_f32_0_nopzero_ieee_daz
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR5]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR5]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
@@ -449,7 +449,7 @@ define float @ret_frexp_f32_0_nopzero_ieee_daz(float nofpclass(pzero) %arg0) #4
 
 define float @ret_frexp_f32_0_nopzero_daz_ieee(float nofpclass(pzero) %arg0) #5 {
 ; CHECK-LABEL: define nofpclass(pzero sub) float @ret_frexp_f32_0_nopzero_daz_ieee
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR6]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR6]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[ARG0]]) #[[ATTR7]]
 ; CHECK-NEXT:    [[CALL_0:%.*]] = extractvalue { float, i32 } [[CALL]], 0
 ; CHECK-NEXT:    ret float [[CALL_0]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
index 396b8c84fc898c..ea594398c58014 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
@@ -465,7 +465,7 @@ define float @if_fcmp_oeq_1_0_else_arg(float %arg) {
 
 define float @if_fcmp_oeq_0_1_else_arg(float %arg) {
 ; CHECK-LABEL: define nofpclass(zero) float @if_fcmp_oeq_0_1_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(zero) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OEQ_0:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OEQ_0]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -477,7 +477,7 @@ define float @if_fcmp_oeq_0_1_else_arg(float %arg) {
 
 define float @if_fcmp_one_0_arg_else_1(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan zero) float @if_fcmp_one_0_arg_else_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(nan zero) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ONE_0:%.*]] = fcmp one float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_0]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -489,7 +489,7 @@ define float @if_fcmp_one_0_arg_else_1(float %arg) {
 
 define float @if_fcmp_une_0_arg_else_1(float %arg) {
 ; CHECK-LABEL: define nofpclass(zero) float @if_fcmp_une_0_arg_else_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(zero) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UNE_0:%.*]] = fcmp une float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UNE_0]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -501,7 +501,7 @@ define float @if_fcmp_une_0_arg_else_1(float %arg) {
 
 define float @if_fcmp_one_0_1_else_arg(float %arg) {
 ; CHECK-LABEL: define nofpclass(inf sub nnorm) float @if_fcmp_one_0_1_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(inf sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ONE_0:%.*]] = fcmp one float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_0]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -524,8 +524,8 @@ define float @if_fcmp_one_1_arg_else_0(float %arg) {
 }
 
 define float @fcmp_fabs_oeq_1_else_1(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_oeq_1_else_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_oeq_1_else_1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    [[FABS_IS_OEQ_1:%.*]] = fcmp oeq float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OEQ_1]], float [[ARG]], float 1.000000e+00
@@ -538,8 +538,8 @@ define float @fcmp_fabs_oeq_1_else_1(float %arg) {
 }
 
 define float @fcmp_fabs_oeq_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_oeq_1_0_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_oeq_1_0_else_arg(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OEQ_1:%.*]] = fcmp oeq float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OEQ_1]], float 0.000000e+00, float [[ARG]]
@@ -552,8 +552,8 @@ define float @fcmp_fabs_oeq_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_ueq_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_ueq_1_0_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_ueq_1_0_else_arg(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UEQ_1:%.*]] = fcmp ueq float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UEQ_1]], float 0.000000e+00, float [[ARG]]
@@ -566,8 +566,8 @@ define float @fcmp_fabs_ueq_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_one_1_arg_else_0(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_one_1_arg_else_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_one_1_arg_else_0(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_1:%.*]] = fcmp one float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_1]], float [[ARG]], float 0.000000e+00
@@ -580,8 +580,8 @@ define float @fcmp_fabs_one_1_arg_else_0(float %arg) {
 }
 
 define float @fcmp_fabs_une_1_arg_else_0(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_une_1_arg_else_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_une_1_arg_else_0(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UNE_1:%.*]] = fcmp une float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UNE_1]], float [[ARG]], float 0.000000e+00
@@ -594,8 +594,8 @@ define float @fcmp_fabs_une_1_arg_else_0(float %arg) {
 }
 
 define float @fcmp_fabs_one_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_one_1_0_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_one_1_0_else_arg(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_1:%.*]] = fcmp one float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_1]], float 0.000000e+00, float [[ARG]]
@@ -608,8 +608,8 @@ define float @fcmp_fabs_one_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_une_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_une_1_0_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_une_1_0_else_arg(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UNE_1:%.*]] = fcmp une float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UNE_1]], float 0.000000e+00, float [[ARG]]
@@ -622,8 +622,8 @@ define float @fcmp_fabs_une_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_one_1_neg2_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_fabs_one_1_neg2_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @fcmp_fabs_one_1_neg2_else_arg(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_1:%.*]] = fcmp one float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_1]], float -2.000000e+00, float [[ARG]]
@@ -753,11 +753,11 @@ define float @fcmp_oeq_largest_denormal_arg_else_0.0(float %arg) {
 
 ; can't be inf
 define float @clamp_fabs_value_ogt_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_ogt_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_ogt_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OGT_1:%.*]] = fcmp ogt float [[FABS_ARG]], 1.000000e+00
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OGT_1]], float [[COPYSIGN]], float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -770,11 +770,11 @@ define float @clamp_fabs_value_ogt_1_to_1_copysign(float %arg) {
 
 ; can't be inf
 define float @clamp_fabs_value_oge_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_oge_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_oge_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OGE_1:%.*]] = fcmp oge float [[FABS_ARG]], 1.000000e+00
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OGE_1]], float [[COPYSIGN]], float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -787,8 +787,8 @@ define float @clamp_fabs_value_oge_1_to_1_copysign(float %arg) {
 
 ; can't be inf or nan
 define float @clamp_fabs_value_olt_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_olt_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_olt_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OLT_1:%.*]] = fcmp olt float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
@@ -804,8 +804,8 @@ define float @clamp_fabs_value_olt_1_to_1_copysign(float %arg) {
 
 ; can't be inf or nan
 define float @clamp_fabs_value_ole_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_ole_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_ole_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OLE_1:%.*]] = fcmp ole float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
@@ -821,11 +821,11 @@ define float @clamp_fabs_value_ole_1_to_1_copysign(float %arg) {
 
 ; can't be inf or nan
 define float @clamp_fabs_value_ugt_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_ugt_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_ugt_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UGT_1:%.*]] = fcmp ugt float [[FABS_ARG]], 1.000000e+00
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UGT_1]], float [[COPYSIGN]], float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -838,11 +838,11 @@ define float @clamp_fabs_value_ugt_1_to_1_copysign(float %arg) {
 
 ; can't be inf or nan
 define float @clamp_fabs_value_uge_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_uge_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_uge_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UGE_1:%.*]] = fcmp ugt float [[FABS_ARG]], 1.000000e+00
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UGE_1]], float [[COPYSIGN]], float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -855,8 +855,8 @@ define float @clamp_fabs_value_uge_1_to_1_copysign(float %arg) {
 
 ; can't be inf
 define float @clamp_fabs_value_ult_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_ult_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_ult_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ULT_1:%.*]] = fcmp ult float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
@@ -872,8 +872,8 @@ define float @clamp_fabs_value_ult_1_to_1_copysign(float %arg) {
 
 ; can't be inf
 define float @clamp_fabs_value_ule_1_to_1_copysign(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_value_ule_1_to_1_copysign(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @clamp_fabs_value_ule_1_to_1_copysign(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ULE_1:%.*]] = fcmp ule float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nofpclass(nan inf zero sub) float @llvm.copysign.f32(float noundef 1.000000e+00, float [[ARG]]) #[[ATTR4]]
@@ -945,8 +945,8 @@ define float @clamp_is_uge_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf
 define float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGT_LARGEST_NORMAL:%.*]] = fcmp ogt float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -960,8 +960,8 @@ define float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf
 define float @clamp_fabs_is_oge_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_is_oge_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_oge_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGE_LARGEST_NORMAL:%.*]] = fcmp oge float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -975,8 +975,8 @@ define float @clamp_fabs_is_oge_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf or nan
 define float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UGT_LARGEST_NORMAL:%.*]] = fcmp ugt float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -990,8 +990,8 @@ define float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf or nan
 define float @clamp_fabs_is_uge_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_is_uge_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_uge_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UGT_LARGEST_NORMAL:%.*]] = fcmp uge float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -1009,8 +1009,8 @@ define float @clamp_fabs_is_uge_largest_normal_to_largest_normal(float %arg) {
 
 ; can't be negative or positive subnormal
 define float @clamp_fabs_ogt_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ogt_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ogt_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGT_SMALLEST_NORMAL:%.*]] = fcmp ogt float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1025,8 +1025,8 @@ define float @clamp_fabs_ogt_smallest_normal_to_zero(float %arg) {
 
 ; can't be negative or positive subnormal
 define float @clamp_fabs_oge_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(inf norm) float @clamp_fabs_oge_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub norm) float @clamp_fabs_oge_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(inf nzero nsub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGE_SMALLEST_NORMAL:%.*]] = fcmp oge float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1040,8 +1040,8 @@ define float @clamp_fabs_oge_smallest_normal_to_zero(float %arg) {
 
 ; can't be negative or subnormal
 define float @clamp_fabs_olt_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(nzero sub) float @clamp_fabs_olt_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_fabs_olt_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLT_SMALLEST_NORMAL:%.*]] = fcmp olt float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1055,8 +1055,8 @@ define float @clamp_fabs_olt_smallest_normal_to_zero(float %arg) {
 
 ; can't be negative or subnormal
 define float @clamp_fabs_ole_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ole_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ole_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLE_SMALLEST_NORMAL:%.*]] = fcmp ole float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1069,8 +1069,8 @@ define float @clamp_fabs_ole_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_is_is_olt_smallest_normal_to_0(float %arg) {
-; CHECK-LABEL: define nofpclass(nzero sub) float @clamp_fabs_is_is_olt_smallest_normal_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_fabs_is_is_olt_smallest_normal_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLT_SMALLEST_NORMAL:%.*]] = fcmp olt float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1083,8 +1083,8 @@ define float @clamp_fabs_is_is_olt_smallest_normal_to_0(float %arg) {
 }
 
 define float @clamp_fabs_is_is_ole_smallest_normal_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_is_is_ole_smallest_normal_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_is_ole_smallest_normal_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLE_SMALLEST_NORMAL:%.*]] = fcmp ole float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1097,8 +1097,8 @@ define float @clamp_fabs_is_is_ole_smallest_normal_to_0(float %arg) {
 }
 
 define float @clamp_fabs_oeq_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_oeq_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_oeq_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OEQ_SMALLEST_NORMAL:%.*]] = fcmp oeq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OEQ_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1111,8 +1111,8 @@ define float @clamp_fabs_oeq_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_one_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_one_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_one_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ONE_SMALLEST_NORMAL:%.*]] = fcmp one float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1125,8 +1125,8 @@ define float @clamp_fabs_one_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_ueq_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ueq_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ueq_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UEQ_SMALLEST_NORMAL:%.*]] = fcmp ueq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UEQ_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1139,8 +1139,8 @@ define float @clamp_fabs_ueq_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_une_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_une_smallest_normal_to_zero(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_une_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UNE_SMALLEST_NORMAL:%.*]] = fcmp une float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UNE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1157,8 +1157,8 @@ define float @clamp_fabs_une_smallest_normal_to_zero(float %arg) {
 ;---------------------------------------------------------------------
 
 define float @clamp_fabs_olt_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_olt_neg1_to_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_olt_neg1_to_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1168,8 +1168,8 @@ define float @clamp_fabs_olt_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ole_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ole_neg1_to_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ole_neg1_to_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1179,8 +1179,8 @@ define float @clamp_fabs_ole_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ult_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ult_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ult_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ULT_NEG1:%.*]] = fcmp ult float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ULT_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1193,8 +1193,8 @@ define float @clamp_fabs_ult_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ule_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ule_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ule_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ULE_NEG1:%.*]] = fcmp ule float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ULE_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1207,8 +1207,8 @@ define float @clamp_fabs_ule_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ogt_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ogt_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ogt_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OGT_NEG1:%.*]] = fcmp ogt float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OGT_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1221,8 +1221,8 @@ define float @clamp_fabs_ogt_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_oge_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_oge_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_oge_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OGE_NEG1:%.*]] = fcmp oge float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OGE_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1236,7 +1236,7 @@ define float @clamp_fabs_oge_neg1_to_neg1(float %arg) {
 
 define float @clamp_fabs_ugt_neg1_to_neg1(float %arg) {
 ; CHECK-LABEL: define noundef nofpclass(nan inf zero sub pnorm) float @clamp_fabs_ugt_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float -1.000000e+00
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1247,7 +1247,7 @@ define float @clamp_fabs_ugt_neg1_to_neg1(float %arg) {
 
 define float @clamp_fabs_uge_neg1_to_neg1(float %arg) {
 ; CHECK-LABEL: define noundef nofpclass(nan inf zero sub pnorm) float @clamp_fabs_uge_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float -1.000000e+00
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1257,8 +1257,8 @@ define float @clamp_fabs_uge_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_oeq_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_oeq_neg1_to_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_oeq_neg1_to_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1268,8 +1268,8 @@ define float @clamp_fabs_oeq_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ueq_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_ueq_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ueq_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UEQ_NEG1:%.*]] = fcmp ueq float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UEQ_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1282,8 +1282,8 @@ define float @clamp_fabs_ueq_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_one_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_fabs_one_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_one_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_NEG1:%.*]] = fcmp one float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1297,7 +1297,7 @@ define float @clamp_fabs_one_neg1_to_neg1(float %arg) {
 
 define float @clamp_fabs_une_neg1_to_neg1(float %arg) {
 ; CHECK-LABEL: define noundef nofpclass(nan inf zero sub pnorm) float @clamp_fabs_une_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float -1.000000e+00
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1407,8 +1407,8 @@ define float @ret_assumed_ule_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ogt_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ogt_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ogt_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OGT_1:%.*]] = fcmp ogt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGT_1]]) #[[ATTR5]]
@@ -1421,8 +1421,8 @@ define float @ret_assumed_fabs_ogt_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_oge_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_oge_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_oge_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OGE_1:%.*]] = fcmp oge float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGE_1]]) #[[ATTR5]]
@@ -1435,8 +1435,8 @@ define float @ret_assumed_fabs_oge_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_olt_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_olt_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_olt_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OLT_1:%.*]] = fcmp olt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLT_1]]) #[[ATTR5]]
@@ -1449,8 +1449,8 @@ define float @ret_assumed_fabs_olt_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ole_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ole_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ole_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OLE_1:%.*]] = fcmp olt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLE_1]]) #[[ATTR5]]
@@ -1463,8 +1463,8 @@ define float @ret_assumed_fabs_ole_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ugt_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ugt_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ugt_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UGT_1:%.*]] = fcmp ugt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGT_1]]) #[[ATTR5]]
@@ -1477,8 +1477,8 @@ define float @ret_assumed_fabs_ugt_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_uge_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_uge_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_uge_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UGE_1:%.*]] = fcmp ugt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGE_1]]) #[[ATTR5]]
@@ -1491,8 +1491,8 @@ define float @ret_assumed_fabs_uge_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ult_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ult_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ult_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ULT_1:%.*]] = fcmp ult float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULT_1]]) #[[ATTR5]]
@@ -1505,8 +1505,8 @@ define float @ret_assumed_fabs_ult_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ule_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ule_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ule_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ULE_1:%.*]] = fcmp ule float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULE_1]]) #[[ATTR5]]
@@ -1687,8 +1687,8 @@ define float @ret_assumed_une_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_oeq_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_oeq_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_oeq_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OEQ_1:%.*]] = fcmp oeq float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OEQ_1]]) #[[ATTR5]]
@@ -1701,8 +1701,8 @@ define float @ret_assumed_fabs_oeq_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ueq_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ueq_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ueq_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UEQ_1:%.*]] = fcmp ueq float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UEQ_1]]) #[[ATTR5]]
@@ -1715,8 +1715,8 @@ define float @ret_assumed_fabs_ueq_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_one_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_one_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_one_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ONE_1:%.*]] = fcmp one float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ONE_1]]) #[[ATTR5]]
@@ -1729,8 +1729,8 @@ define float @ret_assumed_fabs_one_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_une_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_une_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_une_1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UNE_1:%.*]] = fcmp one float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UNE_1]]) #[[ATTR5]]
@@ -1743,8 +1743,8 @@ define float @ret_assumed_fabs_une_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_oeq_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_oeq_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_oeq_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef false) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -1755,8 +1755,8 @@ define float @ret_assumed_fabs_oeq_neg1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ueq_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_ueq_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ueq_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UEQ_NEG1:%.*]] = fcmp ueq float [[ARG_FABS]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UEQ_NEG1]]) #[[ATTR5]]
@@ -1769,8 +1769,8 @@ define float @ret_assumed_fabs_ueq_neg1(float %arg) {
 }
 
 define float @ret_assumed_fabs_one_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_one_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_one_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ONE_NEG1:%.*]] = fcmp one float [[ARG_FABS]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ONE_NEG1]]) #[[ATTR5]]
@@ -1783,8 +1783,8 @@ define float @ret_assumed_fabs_one_neg1(float %arg) {
 }
 
 define float @ret_assumed_fabs_une_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_fabs_une_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_une_neg1(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -1801,7 +1801,7 @@ define float @ret_assumed_fabs_une_neg1(float %arg) {
 ; Can't be +inf
 define float @clamp_is_ogt_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ogt_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGT_KNOWN_POSITIVE:%.*]] = fcmp ogt float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1815,7 +1815,7 @@ define float @clamp_is_ogt_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_oge_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_oge_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGE_KNOWN_POSITIVE:%.*]] = fcmp oge float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1829,7 +1829,7 @@ define float @clamp_is_oge_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_olt_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_olt_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLT_KNOWN_POSITIVE:%.*]] = fcmp olt float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1843,7 +1843,7 @@ define float @clamp_is_olt_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_ole_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ole_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLE_KNOWN_POSITIVE:%.*]] = fcmp olt float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1857,7 +1857,7 @@ define float @clamp_is_ole_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_ugt_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ugt_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UGT_KNOWN_POSITIVE:%.*]] = fcmp ugt float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1871,7 +1871,7 @@ define float @clamp_is_ugt_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_uge_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_uge_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UGE_KNOWN_POSITIVE:%.*]] = fcmp uge float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGE_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1885,7 +1885,7 @@ define float @clamp_is_uge_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_ult_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ult_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ULT_KNOWN_POSITIVE:%.*]] = fcmp ult float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULT_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1899,7 +1899,7 @@ define float @clamp_is_ult_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_ule_known_positive_to_1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ule_known_positive_to_1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ULE_KNOWN_POSITIVE:%.*]] = fcmp ult float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULE_KNOWN_POSITIVE]], float 1.000000e+00, float [[ARG]]
@@ -1913,7 +1913,7 @@ define float @clamp_is_ule_known_positive_to_1(float %arg, float %unknown) {
 
 define float @clamp_is_olt_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_olt_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_OLT_NEGATIVE:%.*]] = fcmp olt float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -1929,7 +1929,7 @@ define float @clamp_is_olt_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_ole_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ole_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_OLE_NEGATIVE:%.*]] = fcmp ole float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -1945,7 +1945,7 @@ define float @clamp_is_ole_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_ogt_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ogt_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_OGT_NEGATIVE:%.*]] = fcmp ogt float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -1961,7 +1961,7 @@ define float @clamp_is_ogt_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_oge_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_oge_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_OGE_NEGATIVE:%.*]] = fcmp oge float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -1977,7 +1977,7 @@ define float @clamp_is_oge_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_ult_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ult_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_ULT_NEGATIVE:%.*]] = fcmp ult float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -1993,7 +1993,7 @@ define float @clamp_is_ult_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_ule_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ule_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_ULE_NEGATIVE:%.*]] = fcmp ule float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2009,7 +2009,7 @@ define float @clamp_is_ule_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_ugt_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_ugt_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_UGT_NEGATIVE:%.*]] = fcmp ugt float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2025,7 +2025,7 @@ define float @clamp_is_ugt_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @clamp_is_uge_known_negative_to_neg1(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @clamp_is_uge_known_negative_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[IS_UGE_NEGATIVE:%.*]] = fcmp uge float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2041,7 +2041,7 @@ define float @clamp_is_uge_known_negative_to_neg1(float %arg, float %unknown) {
 
 define float @ret_assumed_ogt_known_positive(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ogt_known_positive(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OGT_KNOWN_POSITIVE:%.*]] = fcmp ogt float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGT_KNOWN_POSITIVE]]) #[[ATTR5]]
@@ -2055,7 +2055,7 @@ define float @ret_assumed_ogt_known_positive(float %arg, float %unknown) {
 
 define float @ret_assumed_oge_known_positive(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_oge_known_positive(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OGE_KNOWN_POSITIVE:%.*]] = fcmp oge float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGE_KNOWN_POSITIVE]]) #[[ATTR5]]
@@ -2069,7 +2069,7 @@ define float @ret_assumed_oge_known_positive(float %arg, float %unknown) {
 
 define float @ret_assumed_ugt_known_positive(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ugt_known_positive(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UGT_KNOWN_POSITIVE:%.*]] = fcmp ugt float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGT_KNOWN_POSITIVE]]) #[[ATTR5]]
@@ -2083,7 +2083,7 @@ define float @ret_assumed_ugt_known_positive(float %arg, float %unknown) {
 
 define float @ret_assumed_uge_known_positive(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_uge_known_positive(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(ninf nzero nsub nnorm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UGE_KNOWN_POSITIVE:%.*]] = fcmp uge float [[ARG]], [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGE_KNOWN_POSITIVE]]) #[[ATTR5]]
@@ -2097,7 +2097,7 @@ define float @ret_assumed_uge_known_positive(float %arg, float %unknown) {
 
 define float @ret_assumed_olt_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_olt_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[OLT_KNOWN_NEGATIVE:%.*]] = fcmp olt float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2113,7 +2113,7 @@ define float @ret_assumed_olt_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_ole_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ole_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[OLE_KNOWN_NEGATIVE:%.*]] = fcmp ole float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2129,7 +2129,7 @@ define float @ret_assumed_ole_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_ult_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ult_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[ULT_KNOWN_NEGATIVE:%.*]] = fcmp ult float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2145,7 +2145,7 @@ define float @ret_assumed_ult_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_ule_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ule_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[ULE_KNOWN_NEGATIVE:%.*]] = fcmp ule float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2161,7 +2161,7 @@ define float @ret_assumed_ule_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_ogt_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ogt_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[OGT_KNOWN_NEGATIVE:%.*]] = fcmp ogt float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2177,7 +2177,7 @@ define float @ret_assumed_ogt_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_oge_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_oge_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[OGE_KNOWN_NEGATIVE:%.*]] = fcmp oge float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2193,7 +2193,7 @@ define float @ret_assumed_oge_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_ugt_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_ugt_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[UGT_KNOWN_NEGATIVE:%.*]] = fcmp ugt float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2209,7 +2209,7 @@ define float @ret_assumed_ugt_known_negative(float %arg, float %unknown) {
 
 define float @ret_assumed_uge_known_negative(float %arg, float %unknown) {
 ; CHECK-LABEL: define float @ret_assumed_uge_known_negative(
-; CHECK-SAME: float returned [[ARG:%.*]], float [[UNKNOWN:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: float returned [[ARG:%.*]], float nofpclass(inf zero sub norm) [[UNKNOWN:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[KNOWN_POSITIVE:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[KNOWN_NEGATIVE:%.*]] = fneg float [[KNOWN_POSITIVE]]
 ; CHECK-NEXT:    [[UGE_KNOWN_NEGATIVE:%.*]] = fcmp uge float [[ARG]], [[KNOWN_NEGATIVE]]
@@ -2300,8 +2300,8 @@ define float @assume_uno_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_oeq_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_fabs_oeq_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_oeq_smallest_normal(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OEQ_SMALLEST_NORMAL:%.*]] = fcmp oeq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_OEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2314,8 +2314,8 @@ define float @assume_fabs_oeq_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_one_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_fabs_one_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_one_smallest_normal(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ONE_SMALLEST_NORMAL:%.*]] = fcmp one float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_ONE_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2328,8 +2328,8 @@ define float @assume_fabs_one_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_ueq_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_fabs_ueq_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ueq_smallest_normal(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UEQ_SMALLEST_NORMAL:%.*]] = fcmp ueq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_UEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2342,8 +2342,8 @@ define float @assume_fabs_ueq_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_une_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_fabs_une_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_une_smallest_normal(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UNE_SMALLEST_NORMAL:%.*]] = fcmp une float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_UNE_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2356,8 +2356,8 @@ define float @assume_fabs_une_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_ord_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_fabs_ord_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ord_smallest_normal(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ORD_SMALLEST_NORMAL:%.*]] = fcmp ord float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_ORD_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2370,8 +2370,8 @@ define float @assume_fabs_ord_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_uno_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_fabs_uno_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_uno_smallest_normal(
+; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UNO_SMALLEST_NORMAL:%.*]] = fcmp uno float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_UNO_SMALLEST_NORMAL]]) #[[ATTR5]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass-ldexp.ll b/llvm/test/Transforms/Attributor/nofpclass-ldexp.ll
index 73ceaca90807b0..e64b300af6ed83 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-ldexp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-ldexp.ll
@@ -845,7 +845,7 @@ define float @ret_ldexp_f32_22(float %arg0) #0 {
 
 define float @ret_ldexp_f32_23(float %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ldexp_f32_23
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ldexp.f32.i32(float [[ARG0]], i32 noundef 23) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -855,7 +855,7 @@ define float @ret_ldexp_f32_23(float %arg0) #0 {
 
 define float @ret_ldexp_f32_24(float %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_ldexp_f32_24
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.ldexp.f32.i32(float [[ARG0]], i32 noundef 24) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -877,7 +877,7 @@ define float @ret_ldexp_f32_min24(float %arg0, i32 %arg1) #0 {
 
 define float @ret_ldexp_f32_23_nnan(float nofpclass(nan) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_ldexp_f32_23_nnan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.ldexp.f32.i32(float [[ARG0]], i32 noundef 23) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -907,7 +907,7 @@ define double @ret_ldexp_f64_51(double %arg0) #0 {
 
 define double @ret_ldexp_f64_52(double %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) double @ret_ldexp_f64_52
-; CHECK-SAME: (double [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (double nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) double @llvm.ldexp.f64.i32(double [[ARG0]], i32 noundef 52) #[[ATTR10]]
 ; CHECK-NEXT:    ret double [[CALL]]
 ;
@@ -917,7 +917,7 @@ define double @ret_ldexp_f64_52(double %arg0) #0 {
 
 define double @ret_ldexp_f64_53(double %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) double @ret_ldexp_f64_53
-; CHECK-SAME: (double [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (double nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) double @llvm.ldexp.f64.i32(double [[ARG0]], i32 noundef 53) #[[ATTR10]]
 ; CHECK-NEXT:    ret double [[CALL]]
 ;
@@ -947,7 +947,7 @@ define half @ret_ldexp_f16_9(half %arg0) #0 {
 
 define half @ret_ldexp_f16_10(half %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) half @ret_ldexp_f16_10
-; CHECK-SAME: (half [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (half nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) half @llvm.ldexp.f16.i32(half [[ARG0]], i32 noundef 10) #[[ATTR10]]
 ; CHECK-NEXT:    ret half [[CALL]]
 ;
@@ -967,7 +967,7 @@ define bfloat @ret_ldexp_bf16_6(bfloat %arg0) #0 {
 
 define bfloat @ret_ldexp_bf16_7(bfloat %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) bfloat @ret_ldexp_bf16_7
-; CHECK-SAME: (bfloat [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (bfloat nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) bfloat @llvm.ldexp.bf16.i32(bfloat [[ARG0]], i32 noundef 7) #[[ATTR10]]
 ; CHECK-NEXT:    ret bfloat [[CALL]]
 ;
@@ -977,7 +977,7 @@ define bfloat @ret_ldexp_bf16_7(bfloat %arg0) #0 {
 
 define bfloat @ret_ldexp_bf16_8(bfloat %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(sub) bfloat @ret_ldexp_bf16_8
-; CHECK-SAME: (bfloat [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (bfloat nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) bfloat @llvm.ldexp.bf16.i32(bfloat [[ARG0]], i32 noundef 8) #[[ATTR10]]
 ; CHECK-NEXT:    ret bfloat [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-log.ll b/llvm/test/Transforms/Attributor/nofpclass-log.ll
index 46e7ad3a248bfd..b1f81b2be52aea 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-log.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-log.ll
@@ -307,7 +307,7 @@ define float @constrained_log_nonegzero(float nofpclass(nzero) %arg) strictfp {
 
 define float @constrained_log_nozero(float nofpclass(zero) %arg) strictfp {
 ; CHECK-LABEL: define nofpclass(ninf) float @constrained_log_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(ninf zero) [[ARG:%.*]]) #[[ATTR9]] {
 ; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(ninf) float @llvm.experimental.constrained.log.f32(float [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR11]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-minimum-maximum.ll b/llvm/test/Transforms/Attributor/nofpclass-minimum-maximum.ll
index 09c7c4b3fb8582..377dfaf7fa02a2 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-minimum-maximum.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-minimum-maximum.ll
@@ -366,7 +366,7 @@ define float @ret_minimum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub
 
 define float @ret_minimum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_minimum_nopos_nan__any
-; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf psub pnorm) float @llvm.minimum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -376,7 +376,7 @@ define float @ret_minimum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %a
 
 define float @ret_minimum_any__nopos_nan(float %arg0, float nofpclass(pinf psub pnorm nan) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_minimum_any__nopos_nan
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf psub pnorm) float @llvm.minimum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -406,7 +406,7 @@ define float @ret_minimum_any__noneg(float %arg0, float nofpclass(ninf nsub nnor
 
 define float @ret_minimum_nopos__any(float nofpclass(pinf psub pnorm) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_minimum_nopos__any
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf psub pnorm) float @llvm.minimum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -416,7 +416,7 @@ define float @ret_minimum_nopos__any(float nofpclass(pinf psub pnorm) %arg0, flo
 
 define float @ret_minimum_any__nopos(float %arg0, float nofpclass(pinf psub pnorm) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_minimum_any__nopos
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf psub pnorm) float @llvm.minimum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -426,7 +426,7 @@ define float @ret_minimum_any__nopos(float %arg0, float nofpclass(pinf psub pnor
 
 define float @ret_maximum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_maximum_noneg_nan__any
-; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.maximum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -436,7 +436,7 @@ define float @ret_maximum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %a
 
 define float @ret_maximum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub nnorm nan) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_maximum_any__noneg_nan
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.maximum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -466,7 +466,7 @@ define float @ret_maximum_any__nopos_nan(float %arg0, float nofpclass(pinf psub
 
 define float @ret_maximum_noneg__any(float nofpclass(ninf nsub nnorm) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_maximum_noneg__any
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.maximum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -476,7 +476,7 @@ define float @ret_maximum_noneg__any(float nofpclass(ninf nsub nnorm) %arg0, flo
 
 define float @ret_maximum_any__noneg(float %arg0, float nofpclass(ninf nsub nnorm) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_maximum_any__noneg
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.maximum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-minnum-maxnum.ll b/llvm/test/Transforms/Attributor/nofpclass-minnum-maxnum.ll
index 8be3d59a9b3b45..152a1fedffc326 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-minnum-maxnum.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-minnum-maxnum.ll
@@ -27,7 +27,7 @@ define float @ret_minnum_noinf__noinf(float nofpclass(inf) %arg0, float nofpclas
 
 define float @ret_minnum_noinf__nonan(float nofpclass(inf) %arg0, float nofpclass(nan) %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_minnum_noinf__nonan
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -37,7 +37,7 @@ define float @ret_minnum_noinf__nonan(float nofpclass(inf) %arg0, float nofpclas
 
 define float @ret_minnum_nonan__noinf(float nofpclass(nan) %arg0, float nofpclass(inf) %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_minnum_nonan__noinf
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -97,7 +97,7 @@ define float @ret_maxnum_noinf__noinf(float nofpclass(inf) %arg0, float nofpclas
 
 define float @ret_maxnum_noinf__nonan(float nofpclass(inf) %arg0, float nofpclass(nan) %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_maxnum_noinf__nonan
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maxnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -107,7 +107,7 @@ define float @ret_maxnum_noinf__nonan(float nofpclass(inf) %arg0, float nofpclas
 
 define float @ret_maxnum_nonan__noinf(float nofpclass(nan) %arg0, float nofpclass(inf) %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_maxnum_nonan__noinf
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maxnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -346,7 +346,7 @@ define float @ret_minnum_dapz_ieee_nozero__nozero(float nofpclass(zero) %arg0, f
 
 define float @ret_minnum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_minnum_noneg_nan__any
-; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -356,7 +356,7 @@ define float @ret_minnum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %ar
 
 define float @ret_minnum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub nnorm nan) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_minnum_any__noneg_nan
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -366,7 +366,7 @@ define float @ret_minnum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub n
 
 define float @ret_minnum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_minnum_nopos_nan__any
-; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf psub pnorm) float @llvm.minnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -376,7 +376,7 @@ define float @ret_minnum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %ar
 
 define float @ret_minnum_any__nopos_nan(float %arg0, float nofpclass(pinf psub pnorm nan) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_minnum_any__nopos_nan
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf psub pnorm) float @llvm.minnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -426,7 +426,7 @@ define float @ret_minnum_any__nopos(float %arg0, float nofpclass(pinf psub pnorm
 
 define float @ret_maxnum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_maxnum_noneg_nan__any
-; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nsub nnorm) float @llvm.maxnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -436,7 +436,7 @@ define float @ret_maxnum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %ar
 
 define float @ret_maxnum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub nnorm nan) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_maxnum_any__noneg_nan
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nsub nnorm) float @llvm.maxnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -446,7 +446,7 @@ define float @ret_maxnum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub n
 
 define float @ret_maxnum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %arg0, float %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_maxnum_nopos_nan__any
-; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maxnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -456,7 +456,7 @@ define float @ret_maxnum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %ar
 
 define float @ret_maxnum_any__nopos_nan(float %arg0, float nofpclass(pinf psub pnorm nan) %arg1) #3 {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_maxnum_any__nopos_nan
-; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maxnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-nan-fmul.ll b/llvm/test/Transforms/Attributor/nofpclass-nan-fmul.ll
index fbf6c2e0981fb1..d3a46331bac45b 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-nan-fmul.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-nan-fmul.ll
@@ -166,7 +166,7 @@ define float @ret_fmul_daz_nonan_nozero_nosub__nonan_nozero(float nofpclass(nan
 
 define float @ret_fmul_square(float %arg) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_fmul_square
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FMUL:%.*]] = fmul float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FMUL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_fmul_square(float %arg) #0 {
 
 define float @ret_fmul_square_nnan(float nofpclass(nan) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_fmul_square_nnan
-; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FMUL:%.*]] = fmul float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FMUL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_fmul_square_nnan(float nofpclass(nan) %arg) #0 {
 
 define float @ret_fmul_square_nnan_nzero(float nofpclass(nan zero) %arg) #0 {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_fmul_square_nnan_nzero
-; CHECK-SAME: (float nofpclass(nan zero) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FMUL:%.*]] = fmul float [[ARG]], [[ARG]]
 ; CHECK-NEXT:    ret float [[FMUL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-nearbyint.ll b/llvm/test/Transforms/Attributor/nofpclass-nearbyint.ll
index 5962e24312f646..8dcd5c405502cf 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-nearbyint.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-nearbyint.ll
@@ -6,7 +6,7 @@ declare ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128)
 
 define float @ret_nearbyint(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_nearbyint(float %arg0) {
 
 define float @ret_nearbyint_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_nearbyint_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -26,7 +26,7 @@ define float @ret_nearbyint_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_nearbyint_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_nearbyint_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_nearbyint_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_nearbyint_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_nearbyint_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_nearbyint_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_nearbyint_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_nearbyint_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_nearbyint_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_nearbyint_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -66,7 +66,7 @@ define float @ret_nearbyint_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_nearbyint_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_nearbyint_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -76,7 +76,7 @@ define float @ret_nearbyint_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_nearbyint_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -86,7 +86,7 @@ define float @ret_nearbyint_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_nearbyint_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -96,7 +96,7 @@ define float @ret_nearbyint_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_nearbyint_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -106,7 +106,7 @@ define float @ret_nearbyint_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_nearbyint_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -116,7 +116,7 @@ define float @ret_nearbyint_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_nearbyint_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -126,7 +126,7 @@ define float @ret_nearbyint_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_nearbyint_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -136,7 +136,7 @@ define float @ret_nearbyint_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_nearbyint_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -146,7 +146,7 @@ define float @ret_nearbyint_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_nearbyint_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -166,7 +166,7 @@ define float @ret_nearbyint_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_nearbyint_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_nearbyint_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_nearbyint_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_nearbyint_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_nearbyint_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_nearbyint_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -196,7 +196,7 @@ define float @ret_nearbyint_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_nearbyint_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_nearbyint_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -206,7 +206,7 @@ define ppc_fp128 @ret_nearbyint_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_nearbyint_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_nearbyint_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -216,7 +216,7 @@ define ppc_fp128 @ret_nearbyint_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_nearbyint_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_nearbyint_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -226,7 +226,7 @@ define ppc_fp128 @ret_nearbyint_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0)
 
 define ppc_fp128 @ret_nearbyint_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_nearbyint_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -236,7 +236,7 @@ define ppc_fp128 @ret_nearbyint_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0)
 
 define ppc_fp128 @ret_nearbyint_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_nearbyint_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -246,7 +246,7 @@ define ppc_fp128 @ret_nearbyint_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_nearbyint_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_nearbyint_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -256,7 +256,7 @@ define float @ret_nearbyint_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_nearbyint_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_nearbyint_noneg_nonegzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -266,7 +266,7 @@ define float @ret_nearbyint_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzer
 
 define float @ret_nearbyint_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_nearbyint_noneg_nonegzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -276,7 +276,7 @@ define float @ret_nearbyint_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnor
 
 define float @ret_nearbyint_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_nearbyint_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -286,7 +286,7 @@ define float @ret_nearbyint_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %
 
 define float @ret_nearbyint_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_nearbyint_noneg_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -296,7 +296,7 @@ define float @ret_nearbyint_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm z
 
 define float @ret_nearbyint_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_nearbyint_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -306,7 +306,7 @@ define float @ret_nearbyint_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_nearbyint_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_nearbyint_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -316,7 +316,7 @@ define float @ret_nearbyint_nopos_nopzero(float nofpclass(pinf psub pnorm pzero)
 
 define float @ret_nearbyint_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_nearbyint_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -326,7 +326,7 @@ define float @ret_nearbyint_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm
 
 define float @ret_nearbyint_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_nearbyint_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -336,7 +336,7 @@ define float @ret_nearbyint_nopos_nozero(float nofpclass(pinf psub pnorm zero) %
 
 define float @ret_nearbyint_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_nearbyint_nopos_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.nearbyint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-powi.ll b/llvm/test/Transforms/Attributor/nofpclass-powi.ll
index e216bf7ed59b59..2472a547c1a9df 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-powi.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-powi.ll
@@ -49,7 +49,7 @@ define float @ret_powi_f32_odd_constant(float %arg0) #0 {
 
 define float @ret_powi_f32_even_constant(float %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_powi_f32_even_constant
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.powi.f32.i32(float [[ARG0]], i32 noundef 4) #[[ATTR6]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -59,7 +59,7 @@ define float @ret_powi_f32_even_constant(float %arg0) #0 {
 
 define <2 x float> @ret_powi_v2f32_even_nonsplat(<2 x float> %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) <2 x float> @ret_powi_v2f32_even_nonsplat
-; CHECK-SAME: (<2 x float> [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (<2 x float> nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) <2 x float> @llvm.powi.v2f32.v2i32(<2 x float> [[ARG0]], <2 x i32> noundef <i32 2, i32 4>) #[[ATTR6]]
 ; CHECK-NEXT:    ret <2 x float> [[CALL]]
 ;
@@ -79,7 +79,7 @@ define <2 x float> @ret_powi_v2f32_odd_nonsplat(<2 x float> %arg0) #0 {
 
 define float @ret_powi_f32_masked_to_even(float %arg0, i32 %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_powi_f32_masked_to_even
-; CHECK-SAME: (float [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[KNOWN_EVEN:%.*]] = and i32 [[ARG1]], -2
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.powi.f32.i32(float [[ARG0]], i32 [[KNOWN_EVEN]]) #[[ATTR6]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -91,7 +91,7 @@ define float @ret_powi_f32_masked_to_even(float %arg0, i32 %arg1) #0 {
 
 define float @ret_powi_f32_masked_to_even_extrabits(float %arg0, i32 %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_powi_f32_masked_to_even_extrabits
-; CHECK-SAME: (float [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[KNOWN_EVEN:%.*]] = and i32 [[ARG1]], -4
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.powi.f32.i32(float [[ARG0]], i32 [[KNOWN_EVEN]]) #[[ATTR6]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -103,7 +103,7 @@ define float @ret_powi_f32_masked_to_even_extrabits(float %arg0, i32 %arg1) #0 {
 
 define <2 x float> @ret_powi_v2f32_masked_to_even(<2 x float> %arg0, <2 x i32> %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) <2 x float> @ret_powi_v2f32_masked_to_even
-; CHECK-SAME: (<2 x float> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (<2 x float> nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[KNOWN_EVEN:%.*]] = and <2 x i32> [[ARG1]], <i32 -2, i32 -2>
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero nsub nnorm) <2 x float> @llvm.powi.v2f32.v2i32(<2 x float> [[ARG0]], <2 x i32> [[KNOWN_EVEN]]) #[[ATTR6]]
 ; CHECK-NEXT:    ret <2 x float> [[CALL]]
@@ -255,7 +255,7 @@ define <4 x float> @powi_v4f32_i32_nozero(<4 x float> nofpclass(zero) %arg0, i32
 
 define <4 x float> @powi_v4f32_i32_constint_even(<4 x float> %arg) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) <4 x float> @powi_v4f32_i32_constint_even
-; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR5]] {
+; CHECK-SAME: (<4 x float> nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR5]] {
 ; CHECK-NEXT:    [[POWI:%.*]] = call nofpclass(ninf nzero nsub nnorm) <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[ARG]], i32 noundef 4) #[[ATTR6]]
 ; CHECK-NEXT:    ret <4 x float> [[POWI]]
 ;
@@ -275,7 +275,7 @@ define <4 x float> @powi_v4f32_i32_constint_odd(<4 x float> %arg) {
 
 define <4 x float> @powi_v4f32_i32_regression(<4 x float> %arg) {
 ; CHECK-LABEL: define nofpclass(nzero) <4 x float> @powi_v4f32_i32_regression
-; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR5]] {
+; CHECK-SAME: (<4 x float> nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR5]] {
 ; CHECK-NEXT:    [[POWI:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[ARG]], i32 noundef 4) #[[ATTR6]]
 ; CHECK-NEXT:    [[USER:%.*]] = fsub <4 x float> [[POWI]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    ret <4 x float> [[USER]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass-rint.ll b/llvm/test/Transforms/Attributor/nofpclass-rint.ll
index 89af06e2d9e790..27b95e0d4abe61 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-rint.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-rint.ll
@@ -6,7 +6,7 @@ declare ppc_fp128 @llvm.rint.ppcf128(ppc_fp128)
 
 define float @ret_rint(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_rint(float %arg0) {
 
 define float @ret_rint_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_rint_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -26,7 +26,7 @@ define float @ret_rint_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_rint_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_rint_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_rint_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_rint_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_rint_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_rint_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_rint_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_rint_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_rint_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_rint_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -66,7 +66,7 @@ define float @ret_rint_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_rint_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_rint_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -76,7 +76,7 @@ define float @ret_rint_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_rint_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -86,7 +86,7 @@ define float @ret_rint_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_rint_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -96,7 +96,7 @@ define float @ret_rint_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_rint_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -106,7 +106,7 @@ define float @ret_rint_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_rint_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -116,7 +116,7 @@ define float @ret_rint_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_rint_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -126,7 +126,7 @@ define float @ret_rint_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_rint_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -136,7 +136,7 @@ define float @ret_rint_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_rint_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -146,7 +146,7 @@ define float @ret_rint_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_rint_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -166,7 +166,7 @@ define float @ret_rint_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_rint_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_rint_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_rint_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_rint_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_rint_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_rint_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -196,7 +196,7 @@ define float @ret_rint_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_rint_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_rint_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.rint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -206,7 +206,7 @@ define ppc_fp128 @ret_rint_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_rint_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_rint_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.rint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -216,7 +216,7 @@ define ppc_fp128 @ret_rint_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_rint_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_rint_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.rint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -226,7 +226,7 @@ define ppc_fp128 @ret_rint_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 
 define ppc_fp128 @ret_rint_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_rint_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.rint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -236,7 +236,7 @@ define ppc_fp128 @ret_rint_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 
 define ppc_fp128 @ret_rint_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_rint_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.rint.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -246,7 +246,7 @@ define ppc_fp128 @ret_rint_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_rint_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_rint_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -256,7 +256,7 @@ define float @ret_rint_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_rint_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_rint_noneg_nonegzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -266,7 +266,7 @@ define float @ret_rint_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %a
 
 define float @ret_rint_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_rint_noneg_nonegzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -276,7 +276,7 @@ define float @ret_rint_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nze
 
 define float @ret_rint_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_rint_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -286,7 +286,7 @@ define float @ret_rint_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0)
 
 define float @ret_rint_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_rint_noneg_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -296,7 +296,7 @@ define float @ret_rint_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero n
 
 define float @ret_rint_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_rint_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -306,7 +306,7 @@ define float @ret_rint_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_rint_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_rint_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -316,7 +316,7 @@ define float @ret_rint_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg
 
 define float @ret_rint_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_rint_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -326,7 +326,7 @@ define float @ret_rint_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero
 
 define float @ret_rint_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_rint_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -336,7 +336,7 @@ define float @ret_rint_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0)
 
 define float @ret_rint_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_rint_nopos_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.rint.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-round.ll b/llvm/test/Transforms/Attributor/nofpclass-round.ll
index cadfeb331b3bef..0f782910ca7fe0 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-round.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-round.ll
@@ -7,12 +7,12 @@ declare ppc_fp128 @llvm.round.ppcf128(ppc_fp128)
 
 define float @ret_round(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round
-; LIGHT-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIGHT-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -22,12 +22,12 @@ define float @ret_round(float %arg0) {
 
 define float @ret_round_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_round_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noinf
-; LIGHT-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -37,12 +37,12 @@ define float @ret_round_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_round_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_round_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopinf
-; LIGHT-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -52,12 +52,12 @@ define float @ret_round_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_round_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_round_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noninf
-; LIGHT-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -67,12 +67,12 @@ define float @ret_round_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_round_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_round_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonan
-; LIGHT-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -82,12 +82,12 @@ define float @ret_round_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_round_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noqnan
-; LIGHT-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -97,12 +97,12 @@ define float @ret_round_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_round_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_round_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nosnan
-; LIGHT-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -112,12 +112,12 @@ define float @ret_round_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_round_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nozero
-; LIGHT-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -127,12 +127,12 @@ define float @ret_round_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_round_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopzero
-; LIGHT-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -142,12 +142,12 @@ define float @ret_round_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_round_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonzero
-; LIGHT-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -157,12 +157,12 @@ define float @ret_round_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_round_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonorm
-; LIGHT-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -172,12 +172,12 @@ define float @ret_round_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_round_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonnorm
-; LIGHT-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -187,12 +187,12 @@ define float @ret_round_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_round_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopnorm
-; LIGHT-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -202,12 +202,12 @@ define float @ret_round_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_round_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonsub
-; LIGHT-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -217,12 +217,12 @@ define float @ret_round_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_round_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopsub
-; LIGHT-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -247,12 +247,12 @@ define float @ret_round_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_round_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopnorm_nopsub
-; LIGHT-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -262,12 +262,12 @@ define float @ret_round_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_round_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonnorm_nonsub
-; LIGHT-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -277,12 +277,12 @@ define float @ret_round_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_round_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopnorm_nonsub
-; LIGHT-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -292,12 +292,12 @@ define float @ret_round_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_round_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_round_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
 ; LIGHT-LABEL: define ppc_fp128 @ret_round_ppcf128
-; LIGHT-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -307,12 +307,12 @@ define ppc_fp128 @ret_round_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_round_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_round_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
 ; LIGHT-LABEL: define ppc_fp128 @ret_round_noinf_ppcf128
-; LIGHT-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -322,12 +322,12 @@ define ppc_fp128 @ret_round_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_round_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_round_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
 ; LIGHT-LABEL: define ppc_fp128 @ret_round_nopinf_ppcf128
-; LIGHT-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -337,12 +337,12 @@ define ppc_fp128 @ret_round_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 
 define ppc_fp128 @ret_round_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_round_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
 ; LIGHT-LABEL: define ppc_fp128 @ret_round_noninf_ppcf128
-; LIGHT-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -352,12 +352,12 @@ define ppc_fp128 @ret_round_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 
 define ppc_fp128 @ret_round_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_round_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
 ; LIGHT-LABEL: define ppc_fp128 @ret_round_nonan_ppcf128
-; LIGHT-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call ppc_fp128 @llvm.round.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -367,12 +367,12 @@ define ppc_fp128 @ret_round_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_round_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_round_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noneg
-; LIGHT-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -382,12 +382,12 @@ define float @ret_round_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_round_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_round_noneg_nonegzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noneg_nonegzero
-; LIGHT-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -397,12 +397,12 @@ define float @ret_round_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %
 
 define float @ret_round_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_round_noneg_nonegzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noneg_nonegzero_nonan
-; LIGHT-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -412,12 +412,12 @@ define float @ret_round_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nz
 
 define float @ret_round_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_round_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noneg_nozero
-; LIGHT-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -427,12 +427,12 @@ define float @ret_round_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0
 
 define float @ret_round_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_round_noneg_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_noneg_nozero_nonan
-; LIGHT-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -442,12 +442,12 @@ define float @ret_round_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero
 
 define float @ret_round_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_round_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopos
-; LIGHT-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -457,12 +457,12 @@ define float @ret_round_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_round_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_round_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopos_nopzero
-; LIGHT-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -472,12 +472,12 @@ define float @ret_round_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %ar
 
 define float @ret_round_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_round_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopos_nopzero_nonan
-; LIGHT-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -487,12 +487,12 @@ define float @ret_round_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzer
 
 define float @ret_round_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_round_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopos_nozero
-; LIGHT-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -502,12 +502,12 @@ define float @ret_round_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0
 
 define float @ret_round_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_round_nopos_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopos_nozero_nonan
-; LIGHT-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -517,12 +517,12 @@ define float @ret_round_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero
 
 define float @ret_round_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nopzero_nopnorm
-; CHECK-SAME: (float nofpclass(pzero pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nopzero_nopnorm
-; LIGHT-SAME: (float nofpclass(pzero pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -532,12 +532,12 @@ define float @ret_round_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 
 define float @ret_round_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nonzero_nonnorm
-; CHECK-SAME: (float nofpclass(nzero nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nonzero_nonnorm
-; LIGHT-SAME: (float nofpclass(nzero nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
@@ -547,12 +547,12 @@ define float @ret_round_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 
 define float @ret_round_nozero_nonorm(float nofpclass(zero norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_round_nozero_nonorm
-; CHECK-SAME: (float nofpclass(zero norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 ; LIGHT-LABEL: define float @ret_round_nozero_nonorm
-; LIGHT-SAME: (float nofpclass(zero norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; LIGHT-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; LIGHT-NEXT:    [[CALL:%.*]] = call float @llvm.round.f32(float [[ARG0]]) #[[ATTR2]]
 ; LIGHT-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-roundeven.ll b/llvm/test/Transforms/Attributor/nofpclass-roundeven.ll
index 6fae01d9cc2105..822ca51115bcc6 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-roundeven.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-roundeven.ll
@@ -6,7 +6,7 @@ declare ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128)
 
 define float @ret_roundeven(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_roundeven(float %arg0) {
 
 define float @ret_roundeven_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_roundeven_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -26,7 +26,7 @@ define float @ret_roundeven_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_roundeven_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_roundeven_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_roundeven_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_roundeven_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_roundeven_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_roundeven_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_roundeven_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_roundeven_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_roundeven_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_roundeven_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -66,7 +66,7 @@ define float @ret_roundeven_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_roundeven_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_roundeven_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -76,7 +76,7 @@ define float @ret_roundeven_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_roundeven_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -86,7 +86,7 @@ define float @ret_roundeven_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_roundeven_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -96,7 +96,7 @@ define float @ret_roundeven_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_roundeven_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -106,7 +106,7 @@ define float @ret_roundeven_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_roundeven_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -116,7 +116,7 @@ define float @ret_roundeven_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_roundeven_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -126,7 +126,7 @@ define float @ret_roundeven_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_roundeven_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -136,7 +136,7 @@ define float @ret_roundeven_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_roundeven_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -146,7 +146,7 @@ define float @ret_roundeven_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_roundeven_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -166,7 +166,7 @@ define float @ret_roundeven_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_roundeven_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_roundeven_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_roundeven_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_roundeven_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_roundeven_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -196,7 +196,7 @@ define float @ret_roundeven_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_roundeven_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_roundeven_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -206,7 +206,7 @@ define ppc_fp128 @ret_roundeven_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_roundeven_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_roundeven_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -216,7 +216,7 @@ define ppc_fp128 @ret_roundeven_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_roundeven_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_roundeven_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -226,7 +226,7 @@ define ppc_fp128 @ret_roundeven_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0)
 
 define ppc_fp128 @ret_roundeven_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_roundeven_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -236,7 +236,7 @@ define ppc_fp128 @ret_roundeven_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0)
 
 define ppc_fp128 @ret_roundeven_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_roundeven_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -246,7 +246,7 @@ define ppc_fp128 @ret_roundeven_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_roundeven_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_roundeven_noneg
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -256,7 +256,7 @@ define float @ret_roundeven_noneg(float nofpclass(ninf nsub nnorm) %arg0) {
 
 define float @ret_roundeven_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_roundeven_noneg_nonegzero
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -266,7 +266,7 @@ define float @ret_roundeven_noneg_nonegzero(float nofpclass(ninf nsub nnorm nzer
 
 define float @ret_roundeven_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnorm nzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_roundeven_noneg_nonegzero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -276,7 +276,7 @@ define float @ret_roundeven_noneg_nonegzero_nonan(float nofpclass(ninf nsub nnor
 
 define float @ret_roundeven_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @ret_roundeven_noneg_nozero
-; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nzero sub nnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -286,7 +286,7 @@ define float @ret_roundeven_noneg_nozero(float nofpclass(ninf nsub nnorm zero) %
 
 define float @ret_roundeven_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @ret_roundeven_noneg_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan ninf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf zero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -296,7 +296,7 @@ define float @ret_roundeven_noneg_nozero_nonan(float nofpclass(ninf nsub nnorm z
 
 define float @ret_roundeven_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_roundeven_nopos
-; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -306,7 +306,7 @@ define float @ret_roundeven_nopos(float nofpclass(pinf psub pnorm) %arg0) {
 
 define float @ret_roundeven_nopos_nopzero(float nofpclass(pinf psub pnorm pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_roundeven_nopos_nopzero
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -316,7 +316,7 @@ define float @ret_roundeven_nopos_nopzero(float nofpclass(pinf psub pnorm pzero)
 
 define float @ret_roundeven_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm pzero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_roundeven_nopos_nopzero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -326,7 +326,7 @@ define float @ret_roundeven_nopos_nopzero_nonan(float nofpclass(pinf psub pnorm
 
 define float @ret_roundeven_nopos_nozero(float nofpclass(pinf psub pnorm zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf pzero sub pnorm) float @ret_roundeven_nopos_nozero
-; CHECK-SAME: (float nofpclass(pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf pzero sub pnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -336,7 +336,7 @@ define float @ret_roundeven_nopos_nozero(float nofpclass(pinf psub pnorm zero) %
 
 define float @ret_roundeven_nopos_nozero_nonan(float nofpclass(pinf psub pnorm zero nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub pnorm) float @ret_roundeven_nopos_nozero_nonan
-; CHECK-SAME: (float nofpclass(nan pinf zero psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan pinf zero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero sub pnorm) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -346,7 +346,7 @@ define float @ret_roundeven_nopos_nozero_nonan(float nofpclass(pinf psub pnorm z
 
 define float @ret_roundeven_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nopzero_nopnorm
-; CHECK-SAME: (float nofpclass(pzero pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -356,7 +356,7 @@ define float @ret_roundeven_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0)
 
 define float @ret_roundeven_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nonzero_nonnorm
-; CHECK-SAME: (float nofpclass(nzero nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -366,7 +366,7 @@ define float @ret_roundeven_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0)
 
 define float @ret_roundeven_nozero_nonorm(float nofpclass(zero norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_roundeven_nozero_nonorm
-; CHECK-SAME: (float nofpclass(zero norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.roundeven.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-select.ll b/llvm/test/Transforms/Attributor/nofpclass-select.ll
index 3a8847590ce798..fc7965e2dea654 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-select.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-select.ll
@@ -13,7 +13,7 @@ declare float @llvm.arithmetic.fence.f32(float)
 
 define float @ret_select_nnan_flag(i1 %cond, float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_select_nnan_flag
-; CHECK-SAME: (i1 [[COND:%.*]], float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[SELECT:%.*]] = select nnan i1 [[COND]], float [[ARG0]], float [[ARG1]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -23,7 +23,7 @@ define float @ret_select_nnan_flag(i1 %cond, float %arg0, float %arg1) {
 
 define float @ret_select_ninf_flag(i1 %cond, float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_select_ninf_flag
-; CHECK-SAME: (i1 [[COND:%.*]], float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(inf) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[SELECT:%.*]] = select ninf i1 [[COND]], float [[ARG0]], float [[ARG1]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -33,7 +33,7 @@ define float @ret_select_ninf_flag(i1 %cond, float %arg0, float %arg1) {
 
 define float @ret_select_nnan_ninf_flag(i1 %cond, float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan inf) float @ret_select_nnan_ninf_flag
-; CHECK-SAME: (i1 [[COND:%.*]], float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[SELECT:%.*]] = select nnan ninf i1 [[COND]], float [[ARG0]], float [[ARG1]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -43,7 +43,7 @@ define float @ret_select_nnan_ninf_flag(i1 %cond, float %arg0, float %arg1) {
 
 define float @ret_fence_select_nnan_flag(i1 %cond, float %arg0, float %arg1) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_fence_select_nnan_flag
-; CHECK-SAME: (i1 [[COND:%.*]], float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[SELECT:%.*]] = select nnan i1 [[COND]], float [[ARG0]], float [[ARG1]]
 ; CHECK-NEXT:    [[FENCE:%.*]] = call nofpclass(nan) float @llvm.arithmetic.fence.f32(float [[SELECT]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[FENCE]]
@@ -66,7 +66,7 @@ define float @ret_select_nonan__noinf_nonan(i1 %cond, float nofpclass(nan) %arg0
 ; Clamp nan to 0 pattern
 define float @ret_select_clamp_nan_to_zero_uno(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_select_clamp_nan_to_zero_uno
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_NAN:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_NAN]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -78,7 +78,7 @@ define float @ret_select_clamp_nan_to_zero_uno(float %arg) {
 
 define float @ret_select_clamp_nan_to_zero_ord(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_select_clamp_nan_to_zero_ord
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[NOT_NAN:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[NOT_NAN]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -90,7 +90,7 @@ define float @ret_select_clamp_nan_to_zero_ord(float %arg) {
 
 define float @ret_select_clamp_onlynans(float %arg) {
 ; CHECK-LABEL: define nofpclass(inf zero sub norm) float @ret_select_clamp_onlynans
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf zero sub norm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[NOT_NAN:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[NOT_NAN]], float 0x7FF8000000000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -101,8 +101,8 @@ define float @ret_select_clamp_onlynans(float %arg) {
 }
 
 define float @clamp_nonfinite_to_normal_olt(float %arg) {
-; CHECK-LABEL: define nofpclass(nan inf) float @clamp_nonfinite_to_normal_olt
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @clamp_nonfinite_to_normal_olt
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_FINITE:%.*]] = fcmp olt float [[FABS]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_FINITE]], float [[ARG]], float 1.024000e+03
@@ -115,8 +115,8 @@ define float @clamp_nonfinite_to_normal_olt(float %arg) {
 }
 
 define float @clamp_eq_inf_to_pnormal(float %arg) {
-; CHECK-LABEL: define nofpclass(inf) float @clamp_eq_inf_to_pnormal
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @clamp_eq_inf_to_pnormal
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_INF]], float 1.024000e+03, float [[ARG]]
@@ -130,7 +130,7 @@ define float @clamp_eq_inf_to_pnormal(float %arg) {
 
 define float @clamp_eq_pinf_to_pnormal(float %arg) {
 ; CHECK-LABEL: define nofpclass(pinf) float @clamp_eq_pinf_to_pnormal
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[ARG]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_INF]], float 1.024000e+03, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -142,7 +142,7 @@ define float @clamp_eq_pinf_to_pnormal(float %arg) {
 
 define float @clamp_eq_ninf_to_negnormal(float %arg) {
 ; CHECK-LABEL: define nofpclass(ninf) float @clamp_eq_ninf_to_negnormal
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[ARG]], 0xFFF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_INF]], float -1.024000e+03, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -153,8 +153,8 @@ define float @clamp_eq_ninf_to_negnormal(float %arg) {
 }
 
 define float @clamp_eq_inf_to_nan(float %arg) {
-; CHECK-LABEL: define nofpclass(inf) float @clamp_eq_inf_to_nan
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @clamp_eq_inf_to_nan
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_INF]], float 0x7FF8000000000000, float [[ARG]]
@@ -180,8 +180,8 @@ define float @ret_select_clamp_nan_to_zero_uno_returned_different_arg(float %arg
 
 define float @isfinite_select_fabs_val_0(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @isfinite_select_fabs_val_0
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan inf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_FINITE:%.*]] = fcmp olt float [[FABS]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_FINITE]], float [[FABS]], float 1.024000e+03
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -194,8 +194,8 @@ define float @isfinite_select_fabs_val_0(float %arg) {
 
 define float @isfinite_select_fabs_val_1(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @isfinite_select_fabs_val_1
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan inf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[NOT_IS_FINITE:%.*]] = fcmp uge float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[NOT_IS_FINITE]], float 1.024000e+03, float [[FABS]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -207,8 +207,8 @@ define float @isfinite_select_fabs_val_1(float %arg) {
 }
 
 define float @clamp_denormal_to_poszero(float %arg) {
-; CHECK-LABEL: define nofpclass(nzero sub) float @clamp_denormal_to_poszero
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_denormal_to_poszero
+; CHECK-SAME: (float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_DENORM_OR_ZERO]], float 0.000000e+00, float [[ARG]]
@@ -221,8 +221,8 @@ define float @clamp_denormal_to_poszero(float %arg) {
 }
 
 define float @clamp_denormal_to_negzero(float %arg) {
-; CHECK-LABEL: define nofpclass(pzero sub) float @clamp_denormal_to_negzero
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(ninf pzero sub nnorm) float @clamp_denormal_to_negzero
+; CHECK-SAME: (float nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_DENORM_OR_ZERO]], float -0.000000e+00, float [[ARG]]
@@ -235,11 +235,11 @@ define float @clamp_denormal_to_negzero(float %arg) {
 }
 
 define float @clamp_denormal_to_zero_copysign(float %arg) {
-; CHECK-LABEL: define nofpclass(sub) float @clamp_denormal_to_zero_copysign
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(nan inf sub norm) float @clamp_denormal_to_zero_copysign
+; CHECK-SAME: (float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
-; CHECK-NEXT:    [[ZERO:%.*]] = call float @llvm.copysign.f32(float noundef 0.000000e+00, float [[ARG]]) #[[ATTR2]]
+; CHECK-NEXT:    [[ZERO:%.*]] = call nofpclass(nan inf sub norm) float @llvm.copysign.f32(float noundef 0.000000e+00, float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_DENORM_OR_ZERO]], float [[ZERO]], float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
@@ -251,8 +251,8 @@ define float @clamp_denormal_to_zero_copysign(float %arg) {
 }
 
 define float @clamp_only_denormal_or_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(nan inf norm) float @clamp_only_denormal_or_zero
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub norm) float @clamp_only_denormal_or_zero
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub norm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_DENORM_OR_ZERO]], float [[ARG]], float 0.000000e+00
@@ -265,9 +265,9 @@ define float @clamp_only_denormal_or_zero(float %arg) {
 }
 
 define float @clamp_inf_to_fabs(float %arg) {
-; CHECK-LABEL: define float @clamp_inf_to_fabs
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_inf_to_fabs
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_INF]], float [[FABS]], float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -279,8 +279,8 @@ define float @clamp_inf_to_fabs(float %arg) {
 }
 
 define float @not_clamp_inf_to_fabs(float %arg) {
-; CHECK-LABEL: define float @not_clamp_inf_to_fabs
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @not_clamp_inf_to_fabs
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_INF]], float [[ARG]], float [[FABS]]
@@ -294,7 +294,7 @@ define float @not_clamp_inf_to_fabs(float %arg) {
 
 define float @clamp_zero_to_inf(float %arg) {
 ; CHECK-LABEL: define nofpclass(zero) float @clamp_zero_to_inf
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_ZERO:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ZERO]], float 0x7FF0000000000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -306,7 +306,7 @@ define float @clamp_zero_to_inf(float %arg) {
 
 define float @clamp_zero_to_only_inf(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan ninf sub norm) float @clamp_zero_to_only_inf
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan ninf sub norm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_ZERO:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ZERO]], float [[ARG]], float 0x7FF0000000000000
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -318,7 +318,7 @@ define float @clamp_zero_to_only_inf(float %arg) {
 
 define float @clamp_is_class_subnormal_or_inf_to_nan(float %arg) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @clamp_is_class_subnormal_or_inf_to_nan
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_SUBNORMAL_OR_INF:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 noundef 660) #[[ATTR2]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_SUBNORMAL_OR_INF]], float 0x7FF8000000000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -330,7 +330,7 @@ define float @clamp_is_class_subnormal_or_inf_to_nan(float %arg) {
 
 define float @clamp_is_class_subnormal_or_inf_to_nan_swap(float %arg) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @clamp_is_class_subnormal_or_inf_to_nan_swap
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[NOT_IS_SUBNORMAL_OR_INF:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 noundef 363) #[[ATTR2]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[NOT_IS_SUBNORMAL_OR_INF]], float [[ARG]], float 0x7FF8000000000000
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -342,7 +342,7 @@ define float @clamp_is_class_subnormal_or_inf_to_nan_swap(float %arg) {
 
 define float @ret_select_clamp_nan_to_zero_fpclass(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan) float @ret_select_clamp_nan_to_zero_fpclass
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_NAN:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 noundef 3) #[[ATTR2]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_NAN]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -354,7 +354,7 @@ define float @ret_select_clamp_nan_to_zero_fpclass(float %arg) {
 
 define float @ret_select_clamp_snan_to_zero_fpclass(float %arg) {
 ; CHECK-LABEL: define nofpclass(snan) float @ret_select_clamp_snan_to_zero_fpclass
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_NAN:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 noundef 1) #[[ATTR2]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_NAN]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -366,7 +366,7 @@ define float @ret_select_clamp_snan_to_zero_fpclass(float %arg) {
 
 define float @ret_select_clamp_qnan_to_zero_fpclass(float %arg) {
 ; CHECK-LABEL: define nofpclass(qnan) float @ret_select_clamp_qnan_to_zero_fpclass
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[IS_NAN:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ARG]], i32 noundef 2) #[[ATTR2]]
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_NAN]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -390,7 +390,7 @@ define float @ret_select_clamp_nan_to_zero_fpclass_other_val(float %arg0, float
 
 define float @clamp_is_denorm_or_zero_to_fneg(float %arg) {
 ; CHECK-LABEL: define float @clamp_is_denorm_or_zero_to_fneg
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[NEG_ARG:%.*]] = fneg float [[ARG]]
@@ -406,7 +406,7 @@ define float @clamp_is_denorm_or_zero_to_fneg(float %arg) {
 
 define float @select_is_denorm_or_zero_to_fneg_or_fabs(float %arg) {
 ; CHECK-LABEL: define float @select_is_denorm_or_zero_to_fneg_or_fabs
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[NEG_ARG:%.*]] = fneg float [[ARG]]
@@ -422,7 +422,7 @@ define float @select_is_denorm_or_zero_to_fneg_or_fabs(float %arg) {
 
 define float @select_is_denorm_or_zero_to_fabs_or_fneg(float %arg) {
 ; CHECK-LABEL: define float @select_is_denorm_or_zero_to_fabs_or_fneg
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[IS_DENORM_OR_ZERO:%.*]] = fcmp olt float [[FABS]], 0x3810000000000000
 ; CHECK-NEXT:    [[NEG_ARG:%.*]] = fneg float [[ARG]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass-sin-cos.ll b/llvm/test/Transforms/Attributor/nofpclass-sin-cos.ll
index 3170e8b06e7ea3..9207237484d270 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-sin-cos.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-sin-cos.ll
@@ -6,7 +6,7 @@ declare float @llvm.cos.f32(float)
 
 define float @ret_sin(float %arg) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_sin
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.sin.f32(float [[ARG]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_sin(float %arg) {
 
 define float @ret_cos(float %arg) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_cos
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.cos.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_cos_noinf(float nofpclass(inf) %arg) {
 
 define float @ret_sin_nonan(float nofpclass(nan) %arg) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_sin_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.sin.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_sin_nonan(float nofpclass(nan) %arg) {
 
 define float @ret_cos_nonan(float nofpclass(nan) %arg) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_cos_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.cos.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -87,7 +87,7 @@ define float @ret_cos_nonan_noinf(float nofpclass(nan inf) %arg) {
 
 define float @ret_sin_noqnan(float nofpclass(qnan) %arg) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_sin_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan inf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.sin.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -97,7 +97,7 @@ define float @ret_sin_noqnan(float nofpclass(qnan) %arg) {
 
 define float @ret_cos_noqnan(float nofpclass(qnan) %arg) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_cos_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan inf) [[ARG:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.cos.f32(float [[ARG]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-sqrt.ll b/llvm/test/Transforms/Attributor/nofpclass-sqrt.ll
index 5b89276dd6ac3b..c2b5b85ac7cf1c 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-sqrt.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-sqrt.ll
@@ -7,7 +7,7 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata)
 
 define float @ret_sqrt(float %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_sqrt
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -17,7 +17,7 @@ define float @ret_sqrt(float %arg0) #0 {
 
 define float @ret_sqrt_noinf(float nofpclass(inf) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_sqrt_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(inf nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -27,7 +27,7 @@ define float @ret_sqrt_noinf(float nofpclass(inf) %arg0) #0 {
 
 define float @ret_sqrt_nopinf(float nofpclass(pinf) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_sqrt_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(inf nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -37,7 +37,7 @@ define float @ret_sqrt_nopinf(float nofpclass(pinf) %arg0) #0 {
 
 define float @ret_sqrt_noninf(float nofpclass(ninf) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_sqrt_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -47,7 +47,7 @@ define float @ret_sqrt_noninf(float nofpclass(ninf) %arg0) #0 {
 
 define float @ret_sqrt_nonan(float nofpclass(nan) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(snan ninf nsub nnorm) float @ret_sqrt_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan ninf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -57,7 +57,7 @@ define float @ret_sqrt_nonan(float nofpclass(nan) %arg0) #0 {
 
 define float @ret_sqrt_nonan_noinf(float nofpclass(nan inf) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(snan inf nsub nnorm) float @ret_sqrt_nonan_noinf
-; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(nan inf nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -67,7 +67,7 @@ define float @ret_sqrt_nonan_noinf(float nofpclass(nan inf) %arg0) #0 {
 
 define float @ret_sqrt_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(snan inf nzero nsub nnorm) float @ret_sqrt_nonan_noinf_nozero
-; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(nan inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -77,7 +77,7 @@ define float @ret_sqrt_nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0) #
 
 define float @ret_sqrt_noinf_nozero(float nofpclass(inf zero) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -87,7 +87,7 @@ define float @ret_sqrt_noinf_nozero(float nofpclass(inf zero) %arg0) #0 {
 
 define float @ret_sqrt_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #0 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -110,7 +110,7 @@ define float @ret_sqrt_positive_source(i32 %arg) #0 {
 ; Could produce a nan because we don't know if the multiply is negative.
 define float @ret_sqrt_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(nan) %arg1) #0 {
 ; CHECK-LABEL: define nofpclass(snan ninf nsub nnorm) float @ret_sqrt_unknown_sign
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[UNKNOWN_SIGN_NOT_NAN:%.*]] = fmul nnan float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan ninf nsub nnorm) float @llvm.sqrt.f32(float [[UNKNOWN_SIGN_NOT_NAN]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
@@ -122,7 +122,7 @@ define float @ret_sqrt_unknown_sign(float nofpclass(nan) %arg0, float nofpclass(
 
 define float @ret_sqrt_daz_noinf_nozero(float nofpclass(inf zero) %arg0) #1 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_sqrt_daz_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -132,7 +132,7 @@ define float @ret_sqrt_daz_noinf_nozero(float nofpclass(inf zero) %arg0) #1 {
 
 define <2 x float> @ret_sqrt_daz_noinf_nozero_v2f32(<2 x float> nofpclass(inf zero) %arg0) #1 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) <2 x float> @ret_sqrt_daz_noinf_nozero_v2f32
-; CHECK-SAME: (<2 x float> nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (<2 x float> nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) <2 x float> @llvm.sqrt.v2f32(<2 x float> [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret <2 x float> [[CALL]]
 ;
@@ -142,7 +142,7 @@ define <2 x float> @ret_sqrt_daz_noinf_nozero_v2f32(<2 x float> nofpclass(inf ze
 
 define float @ret_sqrt_daz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #1 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_sqrt_daz_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -152,7 +152,7 @@ define float @ret_sqrt_daz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #1
 
 define float @ret_sqrt_dapz_noinf_nozero(float nofpclass(inf zero) %arg0) #2 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_dapz_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -162,7 +162,7 @@ define float @ret_sqrt_dapz_noinf_nozero(float nofpclass(inf zero) %arg0) #2 {
 
 define float @ret_sqrt_dapz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #2 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_dapz_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR4]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -172,7 +172,7 @@ define float @ret_sqrt_dapz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #2
 
 define float @ret_sqrt_dynamic_noinf_nozero(float nofpclass(inf zero) %arg0) #3 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_sqrt_dynamic_noinf_nozero
-; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]]) #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf zero nsub nnorm) [[ARG0:%.*]]) #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -182,7 +182,7 @@ define float @ret_sqrt_dynamic_noinf_nozero(float nofpclass(inf zero) %arg0) #3
 
 define float @ret_sqrt_dynamic_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #3 {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_sqrt_dynamic_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR5]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR5]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -192,7 +192,7 @@ define float @ret_sqrt_dynamic_noinf_nonegzero(float nofpclass(inf nzero) %arg0)
 
 define float @ret_sqrt_ftz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #4 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_ftz_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR6:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -202,7 +202,7 @@ define float @ret_sqrt_ftz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #4
 
 define float @ret_sqrt_ftpz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #5 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_ftpz_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR7:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -212,7 +212,7 @@ define float @ret_sqrt_ftpz_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #5
 
 define float @ret_sqrt_ftz_dynamic_noinf_nonegzero(float nofpclass(inf nzero) %arg0) #6 {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_sqrt_ftz_dynamic_noinf_nonegzero
-; CHECK-SAME: (float nofpclass(inf nzero) [[ARG0:%.*]]) #[[ATTR8:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(inf nzero nsub nnorm) [[ARG0:%.*]]) #[[ATTR8:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.sqrt.f32(float [[ARG0]]) #[[ATTR10]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -222,7 +222,7 @@ define float @ret_sqrt_ftz_dynamic_noinf_nonegzero(float nofpclass(inf nzero) %a
 
 define float @constrained_sqrt(float %arg) strictfp {
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @constrained_sqrt
-; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR9:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG:%.*]]) #[[ATTR9:[0-9]+]] {
 ; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(ninf nsub nnorm) float @llvm.experimental.constrained.sqrt.f32(float [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR11:[0-9]+]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
@@ -232,7 +232,7 @@ define float @constrained_sqrt(float %arg) strictfp {
 
 define float @constrained_sqrt_nonan(float nofpclass(nan) %arg) strictfp {
 ; CHECK-LABEL: define nofpclass(snan ninf nsub nnorm) float @constrained_sqrt_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG:%.*]]) #[[ATTR9]] {
 ; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(snan ninf nsub nnorm) float @llvm.experimental.constrained.sqrt.f32(float [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR11]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
@@ -242,7 +242,7 @@ define float @constrained_sqrt_nonan(float nofpclass(nan) %arg) strictfp {
 
 define float @constrained_sqrt_nopinf(float nofpclass(pinf) %arg) strictfp {
 ; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @constrained_sqrt_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(inf nsub nnorm) [[ARG:%.*]]) #[[ATTR9]] {
 ; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(inf nsub nnorm) float @llvm.experimental.constrained.sqrt.f32(float [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR11]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
@@ -252,7 +252,7 @@ define float @constrained_sqrt_nopinf(float nofpclass(pinf) %arg) strictfp {
 
 define float @constrained_sqrt_nonegzero(float nofpclass(nzero) %arg) strictfp {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @constrained_sqrt_nonegzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR9]] {
 ; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.experimental.constrained.sqrt.f32(float [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR11]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
@@ -262,7 +262,7 @@ define float @constrained_sqrt_nonegzero(float nofpclass(nzero) %arg) strictfp {
 
 define float @constrained_sqrt_nozero(float nofpclass(zero) %arg) strictfp {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @constrained_sqrt_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(ninf zero nsub nnorm) [[ARG:%.*]]) #[[ATTR9]] {
 ; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.experimental.constrained.sqrt.f32(float [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR11]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-trunc.ll b/llvm/test/Transforms/Attributor/nofpclass-trunc.ll
index 789f927be77513..9305a02e4e4474 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-trunc.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-trunc.ll
@@ -6,7 +6,7 @@ declare ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128)
 
 define float @ret_trunc(float %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -16,7 +16,7 @@ define float @ret_trunc(float %arg0) {
 
 define float @ret_trunc_noinf(float nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) float @ret_trunc_noinf
-; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -26,7 +26,7 @@ define float @ret_trunc_noinf(float nofpclass(inf) %arg0) {
 
 define float @ret_trunc_nopinf(float nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) float @ret_trunc_nopinf
-; CHECK-SAME: (float nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -36,7 +36,7 @@ define float @ret_trunc_nopinf(float nofpclass(pinf) %arg0) {
 
 define float @ret_trunc_noninf(float nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) float @ret_trunc_noninf
-; CHECK-SAME: (float nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -46,7 +46,7 @@ define float @ret_trunc_noninf(float nofpclass(ninf) %arg0) {
 
 define float @ret_trunc_nonan(float nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) float @ret_trunc_nonan
-; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -56,7 +56,7 @@ define float @ret_trunc_nonan(float nofpclass(nan) %arg0) {
 
 define float @ret_trunc_noqnan(float nofpclass(qnan) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_noqnan
-; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(qnan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -66,7 +66,7 @@ define float @ret_trunc_noqnan(float nofpclass(qnan) %arg0) {
 
 define float @ret_trunc_nosnan(float nofpclass(snan) %arg0) {
 ; CHECK-LABEL: define nofpclass(snan sub) float @ret_trunc_nosnan
-; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(snan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -76,7 +76,7 @@ define float @ret_trunc_nosnan(float nofpclass(snan) %arg0) {
 
 define float @ret_trunc_nozero(float nofpclass(zero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nozero
-; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -86,7 +86,7 @@ define float @ret_trunc_nozero(float nofpclass(zero) %arg0) {
 
 define float @ret_trunc_nopzero(float nofpclass(pzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nopzero
-; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -96,7 +96,7 @@ define float @ret_trunc_nopzero(float nofpclass(pzero) %arg0) {
 
 define float @ret_trunc_nonzero(float nofpclass(nzero) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nonzero
-; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -106,7 +106,7 @@ define float @ret_trunc_nonzero(float nofpclass(nzero) %arg0) {
 
 define float @ret_trunc_nonorm(float nofpclass(norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nonorm
-; CHECK-SAME: (float nofpclass(norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -116,7 +116,7 @@ define float @ret_trunc_nonorm(float nofpclass(norm) %arg0) {
 
 define float @ret_trunc_nonnorm(float nofpclass(nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nonnorm
-; CHECK-SAME: (float nofpclass(nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -126,7 +126,7 @@ define float @ret_trunc_nonnorm(float nofpclass(nnorm) %arg0) {
 
 define float @ret_trunc_nopnorm(float nofpclass(pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nopnorm
-; CHECK-SAME: (float nofpclass(pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -136,7 +136,7 @@ define float @ret_trunc_nopnorm(float nofpclass(pnorm) %arg0) {
 
 define float @ret_trunc_nonsub(float nofpclass(nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nonsub
-; CHECK-SAME: (float nofpclass(nsub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -146,7 +146,7 @@ define float @ret_trunc_nonsub(float nofpclass(nsub) %arg0) {
 
 define float @ret_trunc_nopsub(float nofpclass(psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nopsub
-; CHECK-SAME: (float nofpclass(psub) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -166,7 +166,7 @@ define float @ret_trunc_nonorm_nosub(float nofpclass(norm sub) %arg0) {
 
 define float @ret_trunc_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nopnorm_nopsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -176,7 +176,7 @@ define float @ret_trunc_nopnorm_nopsub(float nofpclass(pnorm psub) %arg0) {
 
 define float @ret_trunc_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nonnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -186,7 +186,7 @@ define float @ret_trunc_nonnorm_nonsub(float nofpclass(nnorm nsub) %arg0) {
 
 define float @ret_trunc_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nopnorm_nonsub
-; CHECK-SAME: (float nofpclass(nsub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -196,7 +196,7 @@ define float @ret_trunc_nopnorm_nonsub(float nofpclass(pnorm nsub) %arg0) {
 
 define ppc_fp128 @ret_trunc_ppcf128(ppc_fp128 %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) ppc_fp128 @ret_trunc_ppcf128
-; CHECK-SAME: (ppc_fp128 [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -206,7 +206,7 @@ define ppc_fp128 @ret_trunc_ppcf128(ppc_fp128 %arg0) {
 
 define ppc_fp128 @ret_trunc_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 ; CHECK-LABEL: define nofpclass(inf sub) ppc_fp128 @ret_trunc_noinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(inf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(inf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf sub) ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -216,7 +216,7 @@ define ppc_fp128 @ret_trunc_noinf_ppcf128(ppc_fp128 nofpclass(inf) %arg0) {
 
 define ppc_fp128 @ret_trunc_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 ; CHECK-LABEL: define nofpclass(pinf sub) ppc_fp128 @ret_trunc_nopinf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(pinf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(pinf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(pinf sub) ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -226,7 +226,7 @@ define ppc_fp128 @ret_trunc_nopinf_ppcf128(ppc_fp128 nofpclass(pinf) %arg0) {
 
 define ppc_fp128 @ret_trunc_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 ; CHECK-LABEL: define nofpclass(ninf sub) ppc_fp128 @ret_trunc_noninf_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(ninf) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(ninf sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(ninf sub) ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -236,7 +236,7 @@ define ppc_fp128 @ret_trunc_noninf_ppcf128(ppc_fp128 nofpclass(ninf) %arg0) {
 
 define ppc_fp128 @ret_trunc_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 ; CHECK-LABEL: define nofpclass(nan sub) ppc_fp128 @ret_trunc_nonan_ppcf128
-; CHECK-SAME: (ppc_fp128 nofpclass(nan) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (ppc_fp128 nofpclass(nan sub) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan sub) ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128 [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret ppc_fp128 [[CALL]]
 ;
@@ -246,7 +246,7 @@ define ppc_fp128 @ret_trunc_nonan_ppcf128(ppc_fp128 nofpclass(nan) %arg0) {
 
 define float @ret_trunc_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nopzero_nopnorm
-; CHECK-SAME: (float nofpclass(pzero pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(pzero sub pnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -256,7 +256,7 @@ define float @ret_trunc_nopzero_nopnorm(float nofpclass(pzero pnorm) %arg0) {
 
 define float @ret_trunc_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nonzero_nonnorm
-; CHECK-SAME: (float nofpclass(nzero nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(nzero sub nnorm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
@@ -266,7 +266,7 @@ define float @ret_trunc_nonzero_nonnorm(float nofpclass(nzero nnorm) %arg0) {
 
 define float @ret_trunc_nozero_nonorm(float nofpclass(zero norm) %arg0) {
 ; CHECK-LABEL: define nofpclass(sub) float @ret_trunc_nozero_nonorm
-; CHECK-SAME: (float nofpclass(zero norm) [[ARG0:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(sub) float @llvm.trunc.f32(float [[ARG0]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll
index 051ae8b8fe3d5a..442464cde38948 100644
--- a/llvm/test/Transforms/Attributor/nofpclass.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass.ll
@@ -151,7 +151,7 @@ define [2 x [3 x float]] @return_nofpclass_inf_ret_array() {
 define float @returned_nnan_fadd(float %arg0, float %arg1) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nan) float @returned_nnan_fadd
-; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FADD:%.*]] = fadd nnan float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[FADD]]
 ;
@@ -212,7 +212,7 @@ define void @ninf_arg_used_by_callsite_array([2 x [3 x float]] nofpclass(inf) %a
 define void @nofpclass_call_use_after_unannotated_use(float %arg) {
 ; CHECK-LABEL: define void @nofpclass_call_use_after_unannotated_use
 ; CHECK-SAME: (float [[ARG:%.*]]) {
-; CHECK-NEXT:    call void @extern(float [[ARG]]) #[[ATTR15:[0-9]+]]
+; CHECK-NEXT:    call void @extern(float [[ARG]]) #[[ATTR17:[0-9]+]]
 ; CHECK-NEXT:    call void @extern(float nofpclass(nan inf) [[ARG]])
 ; CHECK-NEXT:    ret void
 ;
@@ -335,7 +335,7 @@ define float @fcmp_ord_assume_callsite_arg_return(float %arg) {
 ; CHECK-SAME: (float returned nofpclass(nan) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IS_NOT_NAN:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_NAN]]) #[[ATTR16:[0-9]+]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_NAN]]) #[[ATTR18:[0-9]+]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(nan) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -379,13 +379,13 @@ define float @call_noinf_0(float nofpclass(inf) %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define float @call_noinf_0
 ; TUNIT-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR17:[0-9]+]]
+; TUNIT-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR19:[0-9]+]]
 ; TUNIT-NEXT:    ret float [[RESULT]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define float @call_noinf_0
 ; CGSCC-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] {
-; CGSCC-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR17:[0-9]+]]
+; CGSCC-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR19:[0-9]+]]
 ; CGSCC-NEXT:    ret float [[RESULT]]
 ;
   %result = call float @only_nofpclass_inf_callers(float %arg)
@@ -396,13 +396,13 @@ define float @call_noinf_1(float %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define float @call_noinf_1
 ; TUNIT-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR17]]
+; TUNIT-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR19]]
 ; TUNIT-NEXT:    ret float [[RESULT]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define float @call_noinf_1
 ; CGSCC-SAME: (float [[ARG:%.*]]) #[[ATTR4]] {
-; CGSCC-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[RESULT:%.*]] = call float @only_nofpclass_inf_callers(float nofpclass(inf) [[ARG]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[RESULT]]
 ;
   %result = call float @only_nofpclass_inf_callers(float nofpclass(inf) %arg)
@@ -425,13 +425,13 @@ define float @call_noinf_return_0(float %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(inf) float @call_noinf_return_0
 ; TUNIT-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float [[ARG]]) #[[ATTR17]]
+; TUNIT-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float [[ARG]]) #[[ATTR19]]
 ; TUNIT-NEXT:    ret float [[RESULT]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(inf) float @call_noinf_return_0
-; CGSCC-SAME: (float [[ARG:%.*]]) #[[ATTR4]] {
-; CGSCC-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float [[ARG]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR4]] {
+; CGSCC-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float nofpclass(inf) [[ARG]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[RESULT]]
 ;
   %result = call nofpclass(inf) float @only_nofpclass_inf_return_users(float %arg)
@@ -442,13 +442,13 @@ define float @call_noinf_return_1(float %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(inf) float @call_noinf_return_1
 ; TUNIT-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float [[ARG]]) #[[ATTR17]]
+; TUNIT-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float [[ARG]]) #[[ATTR19]]
 ; TUNIT-NEXT:    ret float [[RESULT]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(inf) float @call_noinf_return_1
-; CGSCC-SAME: (float [[ARG:%.*]]) #[[ATTR4]] {
-; CGSCC-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float [[ARG]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR4]] {
+; CGSCC-NEXT:    [[RESULT:%.*]] = call nofpclass(inf) float @only_nofpclass_inf_return_users(float nofpclass(inf) [[ARG]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[RESULT]]
 ;
   %result = call nofpclass(inf) float @only_nofpclass_inf_return_users(float %arg)
@@ -460,7 +460,7 @@ define float @fcmp_olt_assume_one_0_callsite_arg_return(float %arg) {
 ; CHECK-SAME: (float returned nofpclass(nan zero) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IS_NOT_ZERO_OR_NAN:%.*]] = fcmp one float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_ZERO_OR_NAN]]) #[[ATTR16]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_ZERO_OR_NAN]]) #[[ATTR18]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(nan zero) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -476,7 +476,7 @@ define float @fcmp_olt_assume_une_0_callsite_arg_return(float %arg) {
 ; CHECK-SAME: (float returned nofpclass(zero) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IS_NOT_ZERO_OR_NAN:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_ZERO_OR_NAN]]) #[[ATTR16]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_ZERO_OR_NAN]]) #[[ATTR18]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(zero) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -488,13 +488,13 @@ entry:
 }
 
 define half @fcmp_assume_issubnormal_callsite_arg_return(half %arg) {
-; CHECK-LABEL: define nofpclass(nan inf norm) half @fcmp_assume_issubnormal_callsite_arg_return
-; CHECK-SAME: (half returned nofpclass(nan inf norm) [[ARG:%.*]]) {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub norm) half @fcmp_assume_issubnormal_callsite_arg_return
+; CHECK-SAME: (half returned nofpclass(nan inf nzero nsub norm) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) half @llvm.fabs.f16(half nofpclass(nan inf norm) [[ARG]]) #[[ATTR18:[0-9]+]]
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) half @llvm.fabs.f16(half nofpclass(nan inf nzero nsub norm) [[ARG]]) #[[ATTR20:[0-9]+]]
 ; CHECK-NEXT:    [[IS_SUBNORMAL:%.*]] = fcmp olt half [[FABS]], 0xH0400
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_SUBNORMAL]]) #[[ATTR16]]
-; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf norm) [[ARG]])
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_SUBNORMAL]]) #[[ATTR18]]
+; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf nzero nsub norm) [[ARG]])
 ; CHECK-NEXT:    ret half [[ARG]]
 ;
 entry:
@@ -524,15 +524,15 @@ entry:
 
 ; Assume not subnormal or zero, and not infinity
 define half @fcmp_assume2_callsite_arg_return(half %arg) {
-; CHECK-LABEL: define nofpclass(nan pinf zero sub) half @fcmp_assume2_callsite_arg_return
-; CHECK-SAME: (half returned nofpclass(nan pinf zero sub) [[ARG:%.*]]) {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) half @fcmp_assume2_callsite_arg_return
+; CHECK-SAME: (half returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) half @llvm.fabs.f16(half nofpclass(nan pinf zero sub) [[ARG]]) #[[ATTR18]]
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) half @llvm.fabs.f16(half nofpclass(nan inf zero sub nnorm) [[ARG]]) #[[ATTR20]]
 ; CHECK-NEXT:    [[NOT_SUBNORMAL_OR_ZERO:%.*]] = fcmp oge half [[FABS]], 0xH0400
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[NOT_SUBNORMAL_OR_ZERO]]) #[[ATTR16]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[NOT_SUBNORMAL_OR_ZERO]]) #[[ATTR18]]
 ; CHECK-NEXT:    [[NOT_INF:%.*]] = fcmp one half [[ARG]], 0xH7C00
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[NOT_INF]]) #[[ATTR16]]
-; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan pinf zero sub) [[ARG]])
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[NOT_INF]]) #[[ATTR18]]
+; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf zero sub nnorm) [[ARG]])
 ; CHECK-NEXT:    ret half [[ARG]]
 ;
 entry:
@@ -551,8 +551,8 @@ define float @is_fpclass_assume_arg_return(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero sub nnorm) float @is_fpclass_assume_arg_return
 ; CHECK-SAME: (float returned nofpclass(nan pinf pzero sub nnorm) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CLASS_TEST:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(nan pinf pzero sub nnorm) [[ARG]], i32 noundef 292) #[[ATTR18]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[CLASS_TEST]]) #[[ATTR16]]
+; CHECK-NEXT:    [[CLASS_TEST:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(nan pinf pzero sub nnorm) [[ARG]], i32 noundef 292) #[[ATTR20]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[CLASS_TEST]]) #[[ATTR18]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(nan pinf pzero sub nnorm) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -566,15 +566,15 @@ entry:
 ; Make sure we don't get confused by looking at an unrelated assume
 ; based on the fabs of the value.
 define half @assume_fcmp_fabs_with_other_fabs_assume(half %arg) {
-; CHECK-LABEL: define nofpclass(nan inf norm) half @assume_fcmp_fabs_with_other_fabs_assume
-; CHECK-SAME: (half returned nofpclass(nan inf norm) [[ARG:%.*]]) {
+; CHECK-LABEL: define nofpclass(nan inf zero nsub norm) half @assume_fcmp_fabs_with_other_fabs_assume
+; CHECK-SAME: (half returned nofpclass(nan inf zero nsub norm) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan inf zero nsub norm) half @llvm.fabs.f16(half nofpclass(nan inf norm) [[ARG]]) #[[ATTR18]]
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan inf zero nsub norm) half @llvm.fabs.f16(half nofpclass(nan inf zero nsub norm) [[ARG]]) #[[ATTR20]]
 ; CHECK-NEXT:    [[UNRELATED_FABS:%.*]] = fcmp one half [[FABS]], 0xH0000
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UNRELATED_FABS]]) #[[ATTR16]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UNRELATED_FABS]]) #[[ATTR18]]
 ; CHECK-NEXT:    [[IS_SUBNORMAL:%.*]] = fcmp olt half [[FABS]], 0xH0400
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_SUBNORMAL]]) #[[ATTR16]]
-; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf norm) [[ARG]])
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_SUBNORMAL]]) #[[ATTR18]]
+; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf zero nsub norm) [[ARG]])
 ; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf zero nsub norm) [[FABS]])
 ; CHECK-NEXT:    ret half [[ARG]]
 ;
@@ -593,15 +593,15 @@ entry:
 ; Make sure if looking through the fabs finds a different source
 ; value, we still identify a test mask by ignoring the fabs
 define half @assume_fcmp_fabs_with_other_fabs_assume_fallback(half %arg) {
-; CHECK-LABEL: define nofpclass(nan inf norm) half @assume_fcmp_fabs_with_other_fabs_assume_fallback
-; CHECK-SAME: (half returned nofpclass(nan inf norm) [[ARG:%.*]]) {
+; CHECK-LABEL: define nofpclass(nan inf nzero sub norm) half @assume_fcmp_fabs_with_other_fabs_assume_fallback
+; CHECK-SAME: (half returned nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan inf nzero sub norm) half @llvm.fabs.f16(half nofpclass(nan inf norm) [[ARG]]) #[[ATTR18]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR16]]
+; CHECK-NEXT:    [[FABS:%.*]] = call nofpclass(nan inf nzero sub norm) half @llvm.fabs.f16(half nofpclass(nan inf nzero sub norm) [[ARG]]) #[[ATTR20]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR18]]
 ; CHECK-NEXT:    [[UNRELATED_FABS:%.*]] = fcmp oeq half [[FABS]], 0xH0000
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UNRELATED_FABS]]) #[[ATTR16]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR16]]
-; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf norm) [[ARG]])
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UNRELATED_FABS]]) #[[ATTR18]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR18]]
+; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf nzero sub norm) [[ARG]])
 ; CHECK-NEXT:    call void @extern.use.f16(half nofpclass(nan inf nzero sub norm) [[FABS]])
 ; CHECK-NEXT:    ret half [[ARG]]
 ;
@@ -628,7 +628,7 @@ define float @assume_bundles(i1 %c, float %ret) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C]], label [[A:%.*]], label [[B:%.*]]
 ; CHECK:       A:
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nofpclass"(float [[RET]], i32 3) ]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef true) #[[ATTR18]] [ "nofpclass"(float [[RET]], i32 3) ]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(nan) [[RET]])
 ; CHECK-NEXT:    ret float [[RET]]
 ; CHECK:       B:
@@ -667,7 +667,7 @@ define float @pass_nofpclass_inf_through_memory(float nofpclass(inf) %arg) {
 ; TUNIT-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR3]] {
 ; TUNIT-NEXT:    [[ALLOCA:%.*]] = alloca float, align 4
 ; TUNIT-NEXT:    store float [[ARG]], ptr [[ALLOCA]], align 4
-; TUNIT-NEXT:    [[RET:%.*]] = call float @returned_load(ptr noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[ALLOCA]]) #[[ATTR19:[0-9]+]]
+; TUNIT-NEXT:    [[RET:%.*]] = call float @returned_load(ptr noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[ALLOCA]]) #[[ATTR21:[0-9]+]]
 ; TUNIT-NEXT:    ret float [[RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
@@ -675,7 +675,7 @@ define float @pass_nofpclass_inf_through_memory(float nofpclass(inf) %arg) {
 ; CGSCC-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR4]] {
 ; CGSCC-NEXT:    [[ALLOCA:%.*]] = alloca float, align 4
 ; CGSCC-NEXT:    store float [[ARG]], ptr [[ALLOCA]], align 4
-; CGSCC-NEXT:    [[RET:%.*]] = call float @returned_load(ptr noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[ALLOCA]]) #[[ATTR19:[0-9]+]]
+; CGSCC-NEXT:    [[RET:%.*]] = call float @returned_load(ptr noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[ALLOCA]]) #[[ATTR21:[0-9]+]]
 ; CGSCC-NEXT:    ret float [[RET]]
 ;
   %alloca = alloca float
@@ -687,14 +687,14 @@ define float @pass_nofpclass_inf_through_memory(float nofpclass(inf) %arg) {
 define float @returned_fabs(float %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs
-; TUNIT-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR20:[0-9]+]]
+; TUNIT-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR22:[0-9]+]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs
-; CGSCC-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -704,14 +704,14 @@ define float @returned_fabs(float %x) {
 define float @returned_fabs_nosnan(float nofpclass(snan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(snan ninf nzero nsub nnorm) float @returned_fabs_nosnan
-; TUNIT-SAME: (float nofpclass(snan) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(snan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(snan) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(snan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(snan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(snan ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(snan ninf nzero nsub nnorm) float @returned_fabs_nosnan
-; CGSCC-SAME: (float nofpclass(snan) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(snan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(snan) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(snan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(snan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(snan ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -721,14 +721,14 @@ define float @returned_fabs_nosnan(float nofpclass(snan) %x) {
 define float @returned_fabs_noqnan(float nofpclass(qnan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(qnan ninf nzero nsub nnorm) float @returned_fabs_noqnan
-; TUNIT-SAME: (float nofpclass(qnan) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(qnan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(qnan ninf nzero nsub nnorm) float @returned_fabs_noqnan
-; CGSCC-SAME: (float nofpclass(qnan) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(qnan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -738,14 +738,14 @@ define float @returned_fabs_noqnan(float nofpclass(qnan) %x) {
 define float @returned_fabs_nonan(float nofpclass(nan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_nonan
-; TUNIT-SAME: (float nofpclass(nan) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_nonan
-; CGSCC-SAME: (float nofpclass(nan) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -755,14 +755,14 @@ define float @returned_fabs_nonan(float nofpclass(nan) %x) {
 define float @returned_fabs_noinf(float nofpclass(inf) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(inf nzero nsub nnorm) float @returned_fabs_noinf
-; TUNIT-SAME: (float nofpclass(inf) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(inf nzero nsub nnorm) float @returned_fabs_noinf
-; CGSCC-SAME: (float nofpclass(inf) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(inf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -772,14 +772,14 @@ define float @returned_fabs_noinf(float nofpclass(inf) %x) {
 define float @returned_fabs_nopos(float nofpclass(psub pnorm pinf) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopos
-; TUNIT-SAME: (float nofpclass(pinf psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf psub pnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf nzero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf nzero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopos
-; CGSCC-SAME: (float nofpclass(pinf psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf psub pnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf nzero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf nzero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -789,14 +789,14 @@ define float @returned_fabs_nopos(float nofpclass(psub pnorm pinf) %x) {
 define float @returned_fabs_nopos_nopzero(float nofpclass(psub pnorm pinf pzero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopos_nopzero
-; TUNIT-SAME: (float nofpclass(pinf pzero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopos_nopzero
-; CGSCC-SAME: (float nofpclass(pinf pzero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -806,14 +806,14 @@ define float @returned_fabs_nopos_nopzero(float nofpclass(psub pnorm pinf pzero)
 define float @returned_fabs_nopos_nozero(float nofpclass(psub pnorm pinf zero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf zero nsub nnorm) float @returned_fabs_nopos_nozero
-; TUNIT-SAME: (float nofpclass(pinf zero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf zero psub pnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf zero nsub nnorm) float @returned_fabs_nopos_nozero
-; CGSCC-SAME: (float nofpclass(pinf zero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf zero psub pnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -823,14 +823,14 @@ define float @returned_fabs_nopos_nozero(float nofpclass(psub pnorm pinf zero) %
 define float @returned_fabs_nopos_nonan(float nofpclass(psub pnorm pinf nan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_nopos_nonan
-; TUNIT-SAME: (float nofpclass(nan pinf psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan pinf psub pnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(nan inf nzero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan inf nzero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_nopos_nonan
-; CGSCC-SAME: (float nofpclass(nan pinf psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan pinf psub pnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(nan inf nzero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan inf nzero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -840,14 +840,14 @@ define float @returned_fabs_nopos_nonan(float nofpclass(psub pnorm pinf nan) %x)
 define float @returned_fabs_noneg(float nofpclass(nsub nnorm ninf) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_noneg
-; TUNIT-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nsub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_noneg
-; CGSCC-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nsub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -858,13 +858,13 @@ define float @returned_fabs_noneg_nonzero(float nofpclass(nsub nnorm ninf nzero)
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_noneg_nonzero
 ; TUNIT-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_noneg_nonzero
 ; CGSCC-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -875,13 +875,13 @@ define float @returned_fabs_noneg_nozero(float nofpclass(nsub nnorm ninf zero) %
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf zero nsub nnorm) float @returned_fabs_noneg_nozero
 ; TUNIT-SAME: (float nofpclass(ninf zero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero nsub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf zero nsub nnorm) float @returned_fabs_noneg_nozero
 ; CGSCC-SAME: (float nofpclass(ninf zero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero nsub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf zero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -891,14 +891,14 @@ define float @returned_fabs_noneg_nozero(float nofpclass(nsub nnorm ninf zero) %
 define float @returned_fabs_noneg_nonan(float nofpclass(nsub nnorm ninf nan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_noneg_nonan
-; TUNIT-SAME: (float nofpclass(nan ninf nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nsub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @returned_fabs_noneg_nonan
-; CGSCC-SAME: (float nofpclass(nan ninf nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nsub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -908,14 +908,14 @@ define float @returned_fabs_noneg_nonan(float nofpclass(nsub nnorm ninf nan) %x)
 define float @returned_fabs_nonsub_nopnorm_nonzero(float nofpclass(nsub pnorm nzero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nonsub_nopnorm_nonzero
-; TUNIT-SAME: (float nofpclass(nzero nsub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nzero nsub pnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(ninf nzero nsub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nonsub_nopnorm_nonzero
-; CGSCC-SAME: (float nofpclass(nzero nsub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nzero nsub pnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(ninf nzero nsub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -925,14 +925,14 @@ define float @returned_fabs_nonsub_nopnorm_nonzero(float nofpclass(nsub pnorm nz
 define float @returned_fabs_nopsub_nonnorm_nopzero(float nofpclass(psub nnorm pzero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopsub_nonnorm_nopzero
-; TUNIT-SAME: (float nofpclass(pzero psub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pzero psub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(ninf zero sub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero sub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nopsub_nonnorm_nopzero
-; CGSCC-SAME: (float nofpclass(pzero psub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pzero psub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(ninf zero sub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf zero sub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -942,14 +942,14 @@ define float @returned_fabs_nopsub_nonnorm_nopzero(float nofpclass(psub nnorm pz
 define float @returned_fabs_nonnorm_nozero(float nofpclass(nnorm nzero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nonnorm_nozero
-; TUNIT-SAME: (float nofpclass(nzero nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nzero nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fabs_nonnorm_nozero
-; CGSCC-SAME: (float nofpclass(nzero nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nzero nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FABS]]
 ;
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -992,7 +992,7 @@ define float @returned_fneg_noqnan(float nofpclass(qnan) %x) {
 define float @returned_fneg_nosnan_ninf_flag(float nofpclass(snan) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(snan inf) float @returned_fneg_nosnan_ninf_flag
-; CHECK-SAME: (float nofpclass(snan) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(snan inf) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg ninf float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1025,7 +1025,7 @@ define float @returned_fneg_noinf(float nofpclass(inf) %x) {
 define float @returned_fneg_noneg(float nofpclass(ninf nsub nnorm nzero) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_noneg
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1036,7 +1036,7 @@ define float @returned_fneg_noneg(float nofpclass(ninf nsub nnorm nzero) %x) {
 define float @returned_fneg_noneg_nnan_flag(float nofpclass(ninf nsub nnorm nzero) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @returned_fneg_noneg_nnan_flag
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(all) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg nnan float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1047,7 +1047,7 @@ define float @returned_fneg_noneg_nnan_flag(float nofpclass(ninf nsub nnorm nzer
 define float @returned_fneg_nonsubnnorm(float nofpclass(nsub nnorm) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(psub pnorm) float @returned_fneg_nonsubnnorm
-; CHECK-SAME: (float nofpclass(nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1058,7 +1058,7 @@ define float @returned_fneg_nonsubnnorm(float nofpclass(nsub nnorm) %x) {
 define float @returned_fneg_nopos(float nofpclass(pinf psub pnorm pzero) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @returned_fneg_nopos
-; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1069,7 +1069,7 @@ define float @returned_fneg_nopos(float nofpclass(pinf psub pnorm pzero) %x) {
 define float @returned_fneg_nopnormpsub(float nofpclass(psub pnorm) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nsub nnorm) float @returned_fneg_nopnormpsub
-; CHECK-SAME: (float nofpclass(psub pnorm) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(sub norm) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1080,7 +1080,7 @@ define float @returned_fneg_nopnormpsub(float nofpclass(psub pnorm) %x) {
 define float @returned_fneg_mixed(float nofpclass(psub nnorm nzero qnan ninf) %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(qnan pinf pzero nsub pnorm) float @returned_fneg_mixed
-; CHECK-SAME: (float nofpclass(qnan ninf nzero psub nnorm) [[X:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(qnan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
@@ -1091,15 +1091,15 @@ define float @returned_fneg_mixed(float nofpclass(psub nnorm nzero qnan ninf) %x
 define float @returned_fneg_fabs(float %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs
-; TUNIT-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs
-; CGSCC-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1111,15 +1111,15 @@ define float @returned_fneg_fabs(float %x) {
 define float @returned_fneg_fabs_nosnan(float nofpclass(snan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(snan pinf pzero psub pnorm) float @returned_fneg_fabs_nosnan
-; TUNIT-SAME: (float nofpclass(snan) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(snan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(snan) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(snan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(snan inf zero sub norm) float @llvm.fabs.f32(float nofpclass(snan inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(snan pinf pzero psub pnorm) float @returned_fneg_fabs_nosnan
-; CGSCC-SAME: (float nofpclass(snan) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(snan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(snan) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(snan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(snan inf zero sub norm) float @llvm.fabs.f32(float nofpclass(snan inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1131,15 +1131,15 @@ define float @returned_fneg_fabs_nosnan(float nofpclass(snan) %x) {
 define float @returned_fneg_fabs_noqnan(float nofpclass(qnan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(qnan pinf pzero psub pnorm) float @returned_fneg_fabs_noqnan
-; TUNIT-SAME: (float nofpclass(qnan) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(qnan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(qnan inf zero sub norm) float @llvm.fabs.f32(float nofpclass(qnan inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(qnan pinf pzero psub pnorm) float @returned_fneg_fabs_noqnan
-; CGSCC-SAME: (float nofpclass(qnan) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(qnan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(qnan inf zero sub norm) float @llvm.fabs.f32(float nofpclass(qnan inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1151,15 +1151,15 @@ define float @returned_fneg_fabs_noqnan(float nofpclass(qnan) %x) {
 define float @returned_fneg_fabs_nonan(float nofpclass(nan) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @returned_fneg_fabs_nonan
-; TUNIT-SAME: (float nofpclass(nan) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(all) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float nofpclass(all) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @returned_fneg_fabs_nonan
-; CGSCC-SAME: (float nofpclass(nan) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(nan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(nan) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(all) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float nofpclass(all) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1171,15 +1171,15 @@ define float @returned_fneg_fabs_nonan(float nofpclass(nan) %x) {
 define float @returned_fneg_fabs_noneg(float nofpclass(ninf nsub nnorm nzero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs_noneg
-; TUNIT-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs_noneg
-; CGSCC-SAME: (float nofpclass(ninf nzero nsub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(ninf nzero nsub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1191,15 +1191,15 @@ define float @returned_fneg_fabs_noneg(float nofpclass(ninf nsub nnorm nzero) %x
 define float @returned_fneg_fabs_nopos(float nofpclass(pinf psub pnorm pzero) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs_nopos
-; TUNIT-SAME: (float nofpclass(pinf pzero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(pinf pzero psub pnorm) float @returned_fneg_fabs_nopos
-; CGSCC-SAME: (float nofpclass(pinf pzero psub pnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(pinf pzero psub pnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1211,15 +1211,15 @@ define float @returned_fneg_fabs_nopos(float nofpclass(pinf psub pnorm pzero) %x
 define float @returned_fneg_fabs_mixed(float nofpclass(psub nnorm nzero qnan ninf) %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(qnan pinf pzero psub pnorm) float @returned_fneg_fabs_mixed
-; TUNIT-SAME: (float nofpclass(qnan ninf nzero psub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan ninf nzero psub nnorm) [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(qnan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(qnan inf zero sub norm) float @llvm.fabs.f32(float nofpclass(qnan inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(qnan pinf pzero psub pnorm) float @returned_fneg_fabs_mixed
-; CGSCC-SAME: (float nofpclass(qnan ninf nzero psub nnorm) [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(qnan ninf nzero nsub nnorm) float @llvm.fabs.f32(float nofpclass(qnan ninf nzero psub nnorm) [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(qnan inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(qnan inf zero sub norm) float @llvm.fabs.f32(float nofpclass(qnan inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1231,15 +1231,15 @@ define float @returned_fneg_fabs_mixed(float nofpclass(psub nnorm nzero qnan nin
 define float @returned_fneg_fabs_ninf_flag_fabs(float %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(inf pzero psub pnorm) float @returned_fneg_fabs_ninf_flag_fabs
-; TUNIT-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call ninf nofpclass(inf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call ninf nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(inf pzero psub pnorm) float @returned_fneg_fabs_ninf_flag_fabs
-; CGSCC-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call ninf nofpclass(inf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call ninf nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1251,15 +1251,15 @@ define float @returned_fneg_fabs_ninf_flag_fabs(float %x) {
 define float @returned_fneg_fabs_ninf_flag_fneg(float %x) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define nofpclass(inf pzero psub pnorm) float @returned_fneg_fabs_ninf_flag_fneg
-; TUNIT-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR20]]
+; TUNIT-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR22]]
 ; TUNIT-NEXT:    [[FNEG_FABS:%.*]] = fneg ninf float [[FABS]]
 ; TUNIT-NEXT:    ret float [[FNEG_FABS]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define nofpclass(inf pzero psub pnorm) float @returned_fneg_fabs_ninf_flag_fneg
-; CGSCC-SAME: (float [[X:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float [[X]]) #[[ATTR17]]
+; CGSCC-SAME: (float nofpclass(inf zero sub norm) [[X:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT:    [[FABS:%.*]] = call nofpclass(inf zero sub norm) float @llvm.fabs.f32(float nofpclass(inf zero sub norm) [[X]]) #[[ATTR19]]
 ; CGSCC-NEXT:    [[FNEG_FABS:%.*]] = fneg ninf float [[FABS]]
 ; CGSCC-NEXT:    ret float [[FNEG_FABS]]
 ;
@@ -1361,9 +1361,9 @@ define float @assume_intersection_not_zero_and_not_nan(float %arg) {
 ; CHECK-SAME: (float returned nofpclass(nan zero) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IS_NOT_ZERO_OR_NAN:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_ZERO_OR_NAN]]) #[[ATTR16]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NOT_ZERO_OR_NAN]]) #[[ATTR18]]
 ; CHECK-NEXT:    [[IS_ORD:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_ORD]]) #[[ATTR16]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_ORD]]) #[[ATTR18]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(nan zero) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -1380,10 +1380,10 @@ define float @assume_intersection_class(float %arg) {
 ; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @assume_intersection_class
 ; CHECK-SAME: (float returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[POS_NORMAL_OR_POS_SUBNORMAL:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(nan inf zero sub nnorm) [[ARG]], i32 noundef 384) #[[ATTR18]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[POS_NORMAL_OR_POS_SUBNORMAL]]) #[[ATTR16]]
-; CHECK-NEXT:    [[IS_NORMAL:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(nan inf zero sub nnorm) [[ARG]], i32 noundef 264) #[[ATTR18]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NORMAL]]) #[[ATTR16]]
+; CHECK-NEXT:    [[POS_NORMAL_OR_POS_SUBNORMAL:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(nan inf zero sub nnorm) [[ARG]], i32 noundef 384) #[[ATTR20]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[POS_NORMAL_OR_POS_SUBNORMAL]]) #[[ATTR18]]
+; CHECK-NEXT:    [[IS_NORMAL:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(nan inf zero sub nnorm) [[ARG]], i32 noundef 264) #[[ATTR20]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_NORMAL]]) #[[ATTR18]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(nan inf zero sub nnorm) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -1401,10 +1401,10 @@ define float @assume_intersection_none(float %arg) {
 ; CHECK-LABEL: define nofpclass(all) float @assume_intersection_none
 ; CHECK-SAME: (float returned nofpclass(all) [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CLASS1:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(all) [[ARG]], i32 noundef 682) #[[ATTR18]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[CLASS1]]) #[[ATTR16]]
-; CHECK-NEXT:    [[CLASS2:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(all) [[ARG]], i32 noundef 341) #[[ATTR18]]
-; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[CLASS2]]) #[[ATTR16]]
+; CHECK-NEXT:    [[CLASS1:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(all) [[ARG]], i32 noundef 682) #[[ATTR20]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[CLASS1]]) #[[ATTR18]]
+; CHECK-NEXT:    [[CLASS2:%.*]] = call i1 @llvm.is.fpclass.f32(float nofpclass(all) [[ARG]], i32 noundef 341) #[[ATTR20]]
+; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[CLASS2]]) #[[ATTR18]]
 ; CHECK-NEXT:    call void @extern.use(float nofpclass(all) [[ARG]])
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -1544,7 +1544,7 @@ define <4 x float> @insertelement_unknown_base(<4 x float> %arg0) {
 define float @insertelement_extractelement_same(<4 x float> %arg0) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nan inf nzero sub norm) float @insertelement_extractelement_same
-; CHECK-SAME: (<4 x float> [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (<4 x float> nofpclass(nan inf nzero sub norm) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <4 x float> [[ARG0]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[INSERT]], i32 1
 ; CHECK-NEXT:    ret float [[EXTRACT]]
@@ -1678,7 +1678,7 @@ define <4 x float> @shufflevector_unknown_all(<2 x float> %arg0, <2 x float> %ar
 define <4 x float> @shufflevector_only_demand_lhs(<2 x float> nofpclass(inf) %arg0, <2 x float> %arg1) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(inf) <4 x float> @shufflevector_only_demand_lhs
-; CHECK-SAME: (<2 x float> nofpclass(inf) [[ARG0:%.*]], <2 x float> [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (<2 x float> nofpclass(inf) [[ARG0:%.*]], <2 x float> nofpclass(inf) [[ARG1:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[ARG0]], <2 x float> [[ARG1]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
 ;
@@ -1689,7 +1689,7 @@ define <4 x float> @shufflevector_only_demand_lhs(<2 x float> nofpclass(inf) %ar
 define <4 x float> @shufflevector_only_demand_rhs(<2 x float> %arg0, <2 x float> nofpclass(inf) %arg1) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(inf) <4 x float> @shufflevector_only_demand_rhs
-; CHECK-SAME: (<2 x float> [[ARG0:%.*]], <2 x float> nofpclass(inf) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (<2 x float> nofpclass(inf) [[ARG0:%.*]], <2 x float> nofpclass(inf) [[ARG1:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[ARG0]], <2 x float> [[ARG1]], <4 x i32> <i32 2, i32 3, i32 3, i32 2>
 ; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
 ;
@@ -1735,7 +1735,7 @@ define float @shufflevector_extractelt0(<2 x float> %arg0, <2 x float> nofpclass
 define float @shufflevector_extractelt1(<2 x float> %arg0, <2 x float> nofpclass(inf) %arg1) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(inf) float @shufflevector_extractelt1
-; CHECK-SAME: (<2 x float> [[ARG0:%.*]], <2 x float> nofpclass(inf) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (<2 x float> nofpclass(inf) [[ARG0:%.*]], <2 x float> nofpclass(inf) [[ARG1:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[ARG0]], <2 x float> [[ARG1]], <4 x i32> <i32 1, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    ret float [[EXTRACT]]
@@ -1771,11 +1771,106 @@ define float @shufflevector_extractelt3(<2 x float> %arg0, <2 x float> nofpclass
   ret float %extract
 }
 
+define i32 @fptosi(float nofpclass(inf nan) %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define i32 @fptosi
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[FPTOSI:%.*]] = fptosi float [[ARG]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[FPTOSI]], 1
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %fptosi = fptosi float %arg to i32
+  %add = add i32 %fptosi, 1
+  ret i32 %add
+}
+
+define float @fptrunc(double nofpclass(inf nan) %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fptrunc
+; CHECK-SAME: (double nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[CAST:%.*]] = fptrunc double [[ARG]] to float
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[CAST]], [[CAST]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  %cast = fptrunc double %arg to float
+  %mul = fmul float %cast, %cast
+  ret float %mul
+}
+
+define double @fpext(float nofpclass(inf nan) %arg) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) double @fpext
+; CHECK-SAME: (float nofpclass(nan inf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[CAST:%.*]] = fpext float [[ARG]] to double
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[CAST]], [[CAST]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %cast = fpext float %arg to double
+  %mul = fmul double %cast, %cast
+  ret double %mul
+}
+
+define float @atomicrmw_fadd(ptr %ptr, float nofpclass(inf nan) %val) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; CHECK-LABEL: define float @atomicrmw_fadd
+; CHECK-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(4) [[PTR:%.*]], float nofpclass(nan inf) [[VAL:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VAL]] seq_cst, align 4
+; CHECK-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr %ptr, float %val seq_cst
+  ret float %result
+}
+
+define float @load(ptr %ptr, float nofpclass(nan inf) %val) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @load
+; CHECK-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], float nofpclass(nan inf) [[VAL:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    store float [[VAL]], ptr [[PTR]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load float, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[LOAD]], [[LOAD]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  store float %val, ptr %ptr
+  %load = load float, ptr %ptr
+  %mul = fmul float %load, %load
+  ret float %mul
+}
+
+define float @load_atomic(ptr %ptr, float nofpclass(nan inf) %val) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @load_atomic
+; CHECK-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], float nofpclass(nan inf) [[VAL:%.*]]) #[[ATTR6]] {
+; CHECK-NEXT:    store atomic float [[VAL]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load atomic float, ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[LOAD]], [[LOAD]]
+; CHECK-NEXT:    ret float [[MUL]]
+;
+  store atomic float %val, ptr %ptr seq_cst, align 4
+  %load = load atomic float, ptr %ptr seq_cst, align 4
+  %mul = fmul float %load, %load
+  ret float %mul
+}
+
+define <8 x float> @shufflevector_shufflevector(<4 x float> nofpclass(inf nan) %arg0, <4 x float> nofpclass(inf nan zero) %arg1) {
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; CHECK-LABEL: define nofpclass(nan inf) <8 x float> @shufflevector_shufflevector
+; CHECK-SAME: (<4 x float> nofpclass(nan inf) [[ARG0:%.*]], <4 x float> nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[SHUFFLE0:%.*]] = shufflevector <4 x float> [[ARG0]], <4 x float> [[ARG0]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[ARG1]], <4 x float> [[ARG1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[SHUFFLE0]], <4 x float> [[SHUFFLE1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[SHUFFLE2]]
+;
+  %shuffle0 = shufflevector <4 x float> %arg0, <4 x float> %arg0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %shuffle1 = shufflevector <4 x float> %arg1, <4 x float> %arg1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle2 = shufflevector <4 x float> %shuffle0, <4 x float> %shuffle1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle2
+}
+
 define float @constrained_sitofp(i32 %arg) strictfp {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite)
 ; CHECK-LABEL: define nofpclass(nan nzero sub) float @constrained_sitofp
-; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR6:[0-9]+]] {
-; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(nan nzero sub) float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR18]]
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(nan nzero sub) float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR20]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -1785,8 +1880,8 @@ define float @constrained_sitofp(i32 %arg) strictfp {
 define float @constrained_uitofp(i32 %arg) strictfp {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite)
 ; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @constrained_uitofp
-; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR6]] {
-; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR18]]
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR8]] {
+; CHECK-NEXT:    [[VAL:%.*]] = call nofpclass(nan ninf nzero sub nnorm) float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[ARG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR20]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -1796,7 +1891,7 @@ define float @constrained_uitofp(i32 %arg) strictfp {
 define float @fadd_p0(float %arg0) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_p0
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -1829,7 +1924,7 @@ define float @fsub_p0(float %arg0) {
 define float @fsub_n0(float %arg0) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fsub_n0
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[ARG0]], -0.000000e+00
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1840,7 +1935,7 @@ define float @fsub_n0(float %arg0) {
 define float @fsub_p0_commute(float %arg0) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fsub_p0_commute
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float 0.000000e+00, [[ARG0]]
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1862,7 +1957,7 @@ define float @fsub_n0_commute(float %arg0) {
 define float @fadd_p0_ftz_daz(float %arg0) #3 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_p0_ftz_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR9:[0-9]+]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -1873,7 +1968,7 @@ define float @fadd_p0_ftz_daz(float %arg0) #3 {
 define float @fadd_n0_ftz_daz(float %arg0) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fadd_n0_ftz_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR8:[0-9]+]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR10:[0-9]+]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], -0.000000e+00
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -1884,7 +1979,7 @@ define float @fadd_n0_ftz_daz(float %arg0) #0 {
 define float @fsub_p0_ftz_daz(float %arg0) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fsub_p0_ftz_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[ARG0]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1895,7 +1990,7 @@ define float @fsub_p0_ftz_daz(float %arg0) #0 {
 define float @fsub_n0_ftz_daz(float %arg0) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fsub_n0_ftz_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[ARG0]], -0.000000e+00
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1906,7 +2001,7 @@ define float @fsub_n0_ftz_daz(float %arg0) #0 {
 define float @fsub_p0_commute_ftz_daz(float %arg0) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fsub_p0_commute_ftz_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float 0.000000e+00, [[ARG0]]
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1917,7 +2012,7 @@ define float @fsub_p0_commute_ftz_daz(float %arg0) #0 {
 define float @fsub_n0_commute_ftz_daz(float %arg0) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fsub_n0_commute_ftz_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float -0.000000e+00, [[ARG0]]
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1928,7 +2023,7 @@ define float @fsub_n0_commute_ftz_daz(float %arg0) #0 {
 define float @fadd_p0_ieee_daz(float %arg0) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_p0_ieee_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR9:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR11:[0-9]+]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -1939,7 +2034,7 @@ define float @fadd_p0_ieee_daz(float %arg0) #2 {
 define float @fadd_p0_dapz_ieee(float %arg0) #4 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_p0_dapz_ieee
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR10:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR12:[0-9]+]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -1950,7 +2045,7 @@ define float @fadd_p0_dapz_ieee(float %arg0) #4 {
 define float @fadd_n0_ieee_daz(float %arg0) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fadd_n0_ieee_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], -0.000000e+00
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -1961,7 +2056,7 @@ define float @fadd_n0_ieee_daz(float %arg0) #2 {
 define float @fsub_p0_ieee_daz(float %arg0) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fsub_p0_ieee_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[ARG0]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1972,7 +2067,7 @@ define float @fsub_p0_ieee_daz(float %arg0) #2 {
 define float @fsub_n0_ieee_daz(float %arg0) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fsub_n0_ieee_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[ARG0]], -0.000000e+00
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1983,7 +2078,7 @@ define float @fsub_n0_ieee_daz(float %arg0) #2 {
 define float @fsub_p0_commute_ieee_daz(float %arg0) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fsub_p0_commute_ieee_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float 0.000000e+00, [[ARG0]]
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -1994,7 +2089,7 @@ define float @fsub_p0_commute_ieee_daz(float %arg0) #2 {
 define float @fsub_n0_commute_ieee_daz(float %arg0) #1 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fsub_n0_commute_ieee_daz
-; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR11:[0-9]+]] {
+; CHECK-SAME: (float [[ARG0:%.*]]) #[[ATTR13:[0-9]+]] {
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float -0.000000e+00, [[ARG0]]
 ; CHECK-NEXT:    ret float [[SUB]]
 ;
@@ -2016,7 +2111,7 @@ define float @fadd_never_negzero_or_negsub(float nofpclass(nzero nsub) %a, float
 define float @fadd_never_negzero_or_negsub_daz(float nofpclass(nzero nsub) %a, float nofpclass(nzero nsub) %b) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_never_negzero_or_negsub_daz
-; CHECK-SAME: (float nofpclass(nzero nsub) [[A:%.*]], float nofpclass(nzero nsub) [[B:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(nzero nsub) [[A:%.*]], float nofpclass(nzero nsub) [[B:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[A]], [[B]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2027,7 +2122,7 @@ define float @fadd_never_negzero_or_negsub_daz(float nofpclass(nzero nsub) %a, f
 define float @fadd_never_negzero_or_negsub_dapz(float nofpclass(nzero nsub) %a, float nofpclass(nzero nsub) %b) #5 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_never_negzero_or_negsub_dapz
-; CHECK-SAME: (float nofpclass(nzero nsub) [[A:%.*]], float nofpclass(nzero nsub) [[B:%.*]]) #[[ATTR12:[0-9]+]] {
+; CHECK-SAME: (float nofpclass(nzero nsub) [[A:%.*]], float nofpclass(nzero nsub) [[B:%.*]]) #[[ATTR14:[0-9]+]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[A]], [[B]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2049,7 +2144,7 @@ define float @fadd_never_negzero_or_possub(float nofpclass(nzero psub) %a, float
 define float @fadd_never_negzero_or_possub_daz(float nofpclass(nzero psub) %a, float nofpclass(nzero psub) %b) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fadd_never_negzero_or_possub_daz
-; CHECK-SAME: (float nofpclass(nzero psub) [[A:%.*]], float nofpclass(nzero psub) [[B:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(nzero psub) [[A:%.*]], float nofpclass(nzero psub) [[B:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[A]], [[B]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2060,7 +2155,7 @@ define float @fadd_never_negzero_or_possub_daz(float nofpclass(nzero psub) %a, f
 define float @fadd_never_negzero_or_possub_dapz(float nofpclass(nzero psub) %a, float nofpclass(nzero psub) %b) #5 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_never_negzero_or_possub_dapz
-; CHECK-SAME: (float nofpclass(nzero psub) [[A:%.*]], float nofpclass(nzero psub) [[B:%.*]]) #[[ATTR12]] {
+; CHECK-SAME: (float nofpclass(nzero psub) [[A:%.*]], float nofpclass(nzero psub) [[B:%.*]]) #[[ATTR14]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[A]], [[B]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2071,7 +2166,7 @@ define float @fadd_never_negzero_or_possub_dapz(float nofpclass(nzero psub) %a,
 define float @fadd_never_negzero_or_sub_daz(float nofpclass(nzero sub) %a, float nofpclass(nzero sub) %b) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_never_negzero_or_sub_daz
-; CHECK-SAME: (float nofpclass(nzero sub) [[A:%.*]], float nofpclass(nzero sub) [[B:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[A:%.*]], float nofpclass(nzero sub) [[B:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[A]], [[B]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2082,7 +2177,7 @@ define float @fadd_never_negzero_or_sub_daz(float nofpclass(nzero sub) %a, float
 define float @fadd_never_negzero_or_sub_dapz(float nofpclass(nzero sub) %a, float nofpclass(nzero sub) %b) #5 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_never_negzero_or_sub_dapz
-; CHECK-SAME: (float nofpclass(nzero sub) [[A:%.*]], float nofpclass(nzero sub) [[B:%.*]]) #[[ATTR12]] {
+; CHECK-SAME: (float nofpclass(nzero sub) [[A:%.*]], float nofpclass(nzero sub) [[B:%.*]]) #[[ATTR14]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[A]], [[B]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2126,7 +2221,7 @@ define float @fadd_known_positive(float nofpclass(ninf nsub nnorm) %arg0, float
 define float @fadd_known_positive_daz(float nofpclass(ninf nsub nnorm) %arg0, float nofpclass(ninf nsub nnorm) %arg1) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @fadd_known_positive_daz
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2137,7 +2232,7 @@ define float @fadd_known_positive_daz(float nofpclass(ninf nsub nnorm) %arg0, fl
 define float @fadd_known_positive_nzero_lhs(float nofpclass(ninf nsub nnorm nzero) %arg0, float nofpclass(ninf nsub nnorm) %arg1) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fadd_known_positive_nzero_lhs
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2148,7 +2243,7 @@ define float @fadd_known_positive_nzero_lhs(float nofpclass(ninf nsub nnorm nzer
 define float @fadd_known_positive_nzero_rhs(float nofpclass(ninf nsub nnorm) %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fadd_known_positive_nzero_rhs
-; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2170,7 +2265,7 @@ define float @fadd_known_positive_nzero(float nofpclass(ninf nsub nnorm nzero) %
 define float @fadd_known_positive_nzero_ftz_daz(float nofpclass(ninf nsub nnorm nzero) %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @fadd_known_positive_nzero_ftz_daz
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2181,7 +2276,7 @@ define float @fadd_known_positive_nzero_ftz_daz(float nofpclass(ninf nsub nnorm
 define float @fadd_known_positive_nzero_ftz(float nofpclass(ninf nsub nnorm nzero) %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) #1 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @fadd_known_positive_nzero_ftz
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR11]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR13]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2192,7 +2287,7 @@ define float @fadd_known_positive_nzero_ftz(float nofpclass(ninf nsub nnorm nzer
 define float @fadd_known_positive_nzero_daz(float nofpclass(ninf nsub nnorm nzero) %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) #2 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fadd_known_positive_nzero_daz
-; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR9]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR11]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2214,7 +2309,7 @@ define float @fadd_known_positive_normal(float nofpclass(ninf nnorm nzero) %arg0
 define float @fadd_known_positive_normal_daz(float nofpclass(ninf nnorm nzero) %arg0, float nofpclass(ninf nnorm nzero) %arg1) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fadd_known_positive_normal_daz
-; CHECK-SAME: (float nofpclass(ninf nzero nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nnorm) [[ARG1:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nnorm) [[ARG1:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2225,7 +2320,7 @@ define float @fadd_known_positive_normal_daz(float nofpclass(ninf nnorm nzero) %
 define float @fadd_known_positive_normal_except0_daz(float nofpclass(ninf nnorm) %arg0, float nofpclass(ninf nnorm) %arg1) #0 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define float @fadd_known_positive_normal_except0_daz
-; CHECK-SAME: (float nofpclass(ninf nnorm) [[ARG0:%.*]], float nofpclass(ninf nnorm) [[ARG1:%.*]]) #[[ATTR8]] {
+; CHECK-SAME: (float nofpclass(ninf nnorm) [[ARG0:%.*]], float nofpclass(ninf nnorm) [[ARG1:%.*]]) #[[ATTR10]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2236,7 +2331,7 @@ define float @fadd_known_positive_normal_except0_daz(float nofpclass(ninf nnorm)
 define float @fadd_known_positive_normal_dapz(float nofpclass(ninf nnorm nzero) %arg0, float nofpclass(ninf nnorm nzero) %arg1) #3 {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define nofpclass(nzero) float @fadd_known_positive_normal_dapz
-; CHECK-SAME: (float nofpclass(ninf nzero nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nnorm) [[ARG1:%.*]]) #[[ATTR7]] {
+; CHECK-SAME: (float nofpclass(ninf nzero nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nnorm) [[ARG1:%.*]]) #[[ATTR9]] {
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[ARG0]], [[ARG1]]
 ; CHECK-NEXT:    ret float [[ADD]]
 ;
@@ -2248,13 +2343,13 @@ define internal float @returns_fence(float %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define internal float @returns_fence
 ; TUNIT-SAME: (float nofpclass(nan inf) [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float [[ARG]]) #[[ATTR20]]
+; TUNIT-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float [[ARG]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define internal float @returns_fence
 ; CGSCC-SAME: (float nofpclass(nan inf) [[ARG:%.*]]) #[[ATTR3]] {
-; CGSCC-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float nofpclass(nan inf) [[ARG]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float nofpclass(nan inf) [[ARG]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[RET]]
 ;
   %ret = call float @llvm.arithmetic.fence.f32(float %arg)
@@ -2266,15 +2361,15 @@ define internal float @refine_callsite_attribute(float nofpclass(inf) %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define internal float @refine_callsite_attribute
 ; TUNIT-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FUNC0:%.*]] = call float @returns_fence(float nofpclass(nan inf) [[ARG]]) #[[ATTR17]]
-; TUNIT-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float [[FUNC0]]) #[[ATTR20]]
+; TUNIT-NEXT:    [[FUNC0:%.*]] = call float @returns_fence(float nofpclass(nan inf) [[ARG]]) #[[ATTR19]]
+; TUNIT-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float [[FUNC0]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define internal float @refine_callsite_attribute
 ; CGSCC-SAME: (float nofpclass(inf) [[ARG:%.*]]) #[[ATTR4]] {
-; CGSCC-NEXT:    [[FUNC0:%.*]] = call float @returns_fence(float nofpclass(nan inf) [[ARG]]) #[[ATTR17]]
-; CGSCC-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float [[FUNC0]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[FUNC0:%.*]] = call float @returns_fence(float nofpclass(nan inf) [[ARG]]) #[[ATTR19]]
+; CGSCC-NEXT:    [[RET:%.*]] = call float @llvm.arithmetic.fence.f32(float [[FUNC0]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[RET]]
 ;
   %func0 = call float @returns_fence(float nofpclass(nan) %arg)
@@ -2286,13 +2381,13 @@ define float @user(float %arg) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define float @user
 ; TUNIT-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; TUNIT-NEXT:    [[FUNC1:%.*]] = call float @refine_callsite_attribute(float nofpclass(inf) [[ARG]]) #[[ATTR17]]
+; TUNIT-NEXT:    [[FUNC1:%.*]] = call float @refine_callsite_attribute(float nofpclass(inf) [[ARG]]) #[[ATTR19]]
 ; TUNIT-NEXT:    ret float [[FUNC1]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define float @user
 ; CGSCC-SAME: (float [[ARG:%.*]]) #[[ATTR4]] {
-; CGSCC-NEXT:    [[FUNC1:%.*]] = call float @refine_callsite_attribute(float nofpclass(inf) [[ARG]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[FUNC1:%.*]] = call float @refine_callsite_attribute(float nofpclass(inf) [[ARG]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[FUNC1]]
 ;
   %func1 = call float @refine_callsite_attribute(float %arg)
@@ -2321,7 +2416,7 @@ define internal float @through_memory1(ptr %ptr.arg) {
 ; TUNIT-NEXT:    [[PTR_ARG_PRIV:%.*]] = alloca float, align 4
 ; TUNIT-NEXT:    store float [[TMP0]], ptr [[PTR_ARG_PRIV]], align 4
 ; TUNIT-NEXT:    [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4
-; TUNIT-NEXT:    [[CALL:%.*]] = call float @llvm.arithmetic.fence.f32(float [[LOAD]]) #[[ATTR20]]
+; TUNIT-NEXT:    [[CALL:%.*]] = call float @llvm.arithmetic.fence.f32(float [[LOAD]]) #[[ATTR22]]
 ; TUNIT-NEXT:    ret float [[CALL]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
@@ -2330,7 +2425,7 @@ define internal float @through_memory1(ptr %ptr.arg) {
 ; CGSCC-NEXT:    [[PTR_ARG_PRIV:%.*]] = alloca float, align 4
 ; CGSCC-NEXT:    store float [[TMP0]], ptr [[PTR_ARG_PRIV]], align 4
 ; CGSCC-NEXT:    [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4
-; CGSCC-NEXT:    [[CALL:%.*]] = call float @llvm.arithmetic.fence.f32(float [[LOAD]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[CALL:%.*]] = call float @llvm.arithmetic.fence.f32(float [[LOAD]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[CALL]]
 ;
   %load = load float, ptr %ptr.arg
@@ -2342,7 +2437,7 @@ define internal float @through_memory1(ptr %ptr.arg) {
 define internal float @through_memory2(ptr %ptr.arg) {
 ; CHECK: Function Attrs: memory(readwrite, argmem: none)
 ; CHECK-LABEL: define internal float @through_memory2
-; CHECK-SAME: (float [[TMP0:%.*]]) #[[ATTR13:[0-9]+]] {
+; CHECK-SAME: (float [[TMP0:%.*]]) #[[ATTR15:[0-9]+]] {
 ; CHECK-NEXT:    [[PTR_ARG_PRIV:%.*]] = alloca float, align 4
 ; CHECK-NEXT:    store float [[TMP0]], ptr [[PTR_ARG_PRIV]], align 4
 ; CHECK-NEXT:    [[LOAD:%.*]] = load float, ptr [[PTR_ARG_PRIV]], align 4
@@ -2367,7 +2462,7 @@ define float @call_through_memory0(float nofpclass(nan) %val) {
 ; CGSCC-SAME: (float nofpclass(nan) [[VAL:%.*]]) #[[ATTR4]] {
 ; CGSCC-NEXT:    [[ALLOCA:%.*]] = alloca float, align 4
 ; CGSCC-NEXT:    store float [[VAL]], ptr [[ALLOCA]], align 4
-; CGSCC-NEXT:    [[THROUGH_MEMORY:%.*]] = call float @through_memory0(float nofpclass(nan) [[VAL]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[THROUGH_MEMORY:%.*]] = call float @through_memory0(float nofpclass(nan) [[VAL]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[THROUGH_MEMORY]]
 ;
   %alloca = alloca float
@@ -2383,7 +2478,7 @@ define float @call_through_memory1(float nofpclass(nan) %val) {
 ; TUNIT-NEXT:    [[ALLOCA:%.*]] = alloca float, align 4
 ; TUNIT-NEXT:    store float [[VAL]], ptr [[ALLOCA]], align 4
 ; TUNIT-NEXT:    [[TMP1:%.*]] = load float, ptr [[ALLOCA]], align 4
-; TUNIT-NEXT:    [[THROUGH_MEMORY:%.*]] = call float @through_memory1(float [[TMP1]]) #[[ATTR19]]
+; TUNIT-NEXT:    [[THROUGH_MEMORY:%.*]] = call float @through_memory1(float [[TMP1]]) #[[ATTR21]]
 ; TUNIT-NEXT:    ret float [[THROUGH_MEMORY]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
@@ -2391,7 +2486,7 @@ define float @call_through_memory1(float nofpclass(nan) %val) {
 ; CGSCC-SAME: (float nofpclass(nan) [[VAL:%.*]]) #[[ATTR4]] {
 ; CGSCC-NEXT:    [[ALLOCA:%.*]] = alloca float, align 4
 ; CGSCC-NEXT:    store float [[VAL]], ptr [[ALLOCA]], align 4
-; CGSCC-NEXT:    [[THROUGH_MEMORY:%.*]] = call float @through_memory1(float nofpclass(nan) [[VAL]]) #[[ATTR17]]
+; CGSCC-NEXT:    [[THROUGH_MEMORY:%.*]] = call float @through_memory1(float nofpclass(nan) [[VAL]]) #[[ATTR19]]
 ; CGSCC-NEXT:    ret float [[THROUGH_MEMORY]]
 ;
   %alloca = alloca float
@@ -2455,7 +2550,7 @@ entry:
 define internal float @pow_wrapper(float %arg, float %arg1) {
 ; TUNIT: Function Attrs: norecurse
 ; TUNIT-LABEL: define internal float @pow_wrapper
-; TUNIT-SAME: (float noundef nofpclass(nan inf) [[ARG:%.*]], float noundef nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR14:[0-9]+]] {
+; TUNIT-SAME: (float noundef nofpclass(nan inf) [[ARG:%.*]], float noundef nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR16:[0-9]+]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[I:%.*]] = tail call float @pow_impl(float noundef nofpclass(nan inf) [[ARG]], float noundef nofpclass(nan inf) [[ARG1]])
 ; TUNIT-NEXT:    ret float [[I]]
@@ -2475,14 +2570,14 @@ bb:
 define internal float @pow_impl(float %arg, float %arg1) {
 ; TUNIT: Function Attrs: norecurse
 ; TUNIT-LABEL: define internal float @pow_impl
-; TUNIT-SAME: (float noundef nofpclass(nan inf) [[ARG:%.*]], float noundef nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR14]] {
+; TUNIT-SAME: (float noundef nofpclass(nan inf) [[ARG:%.*]], float noundef nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR16]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[IMPLEMENT_POW:%.*]] = call float asm "
 ; TUNIT-NEXT:    ret float [[IMPLEMENT_POW]]
 ;
 ; CGSCC: Function Attrs: norecurse
 ; CGSCC-LABEL: define internal float @pow_impl
-; CGSCC-SAME: (float [[ARG:%.*]], float [[ARG1:%.*]]) #[[ATTR14:[0-9]+]] {
+; CGSCC-SAME: (float [[ARG:%.*]], float [[ARG1:%.*]]) #[[ATTR16:[0-9]+]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[IMPLEMENT_POW:%.*]] = call float asm "
 ; CGSCC-NEXT:    ret float [[IMPLEMENT_POW]]
diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll
index 36201b0e96d689..eafb16823fdf14 100644
--- a/llvm/test/Transforms/Attributor/nofree.ll
+++ b/llvm/test/Transforms/Attributor/nofree.ll
@@ -257,8 +257,8 @@ define void @call_floor(float %a) #0 {
 define float @call_floor2(float %a) #0 {
 ; CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
 ; CHECK-LABEL: define {{[^@]+}}@call_floor2
-; CHECK-SAME: (float [[A:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT:    [[C:%.*]] = tail call nofpclass(sub) float @llvm.floor.f32(float [[A]]) #[[ATTR14:[0-9]+]]
+; CHECK-SAME: (float nofpclass(sub) [[A:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[C:%.*]] = tail call nofpclass(sub) float @llvm.floor.f32(float nofpclass(sub) [[A]]) #[[ATTR14:[0-9]+]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %c = tail call float @llvm.floor.f32(float %a)
diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll
index 74722893d85a3e..5059666943809d 100644
--- a/llvm/test/Transforms/Attributor/nosync.ll
+++ b/llvm/test/Transforms/Attributor/nosync.ll
@@ -420,8 +420,8 @@ define i32 @cos_test(float %x) {
 define float @cos_test2(float %x) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CHECK-LABEL: define {{[^@]+}}@cos_test2
-; CHECK-SAME: (float [[X:%.*]]) #[[ATTR18]] {
-; CHECK-NEXT:    [[C:%.*]] = call nofpclass(inf) float @llvm.cos.f32(float [[X]]) #[[ATTR23:[0-9]+]]
+; CHECK-SAME: (float nofpclass(inf) [[X:%.*]]) #[[ATTR18]] {
+; CHECK-NEXT:    [[C:%.*]] = call nofpclass(inf) float @llvm.cos.f32(float nofpclass(inf) [[X]]) #[[ATTR23:[0-9]+]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %c = call float @llvm.cos.f32(float %x)
diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll
index 1e15e89473cf09..0cd2339de6f830 100644
--- a/llvm/test/Transforms/Attributor/willreturn.ll
+++ b/llvm/test/Transforms/Attributor/willreturn.ll
@@ -294,8 +294,8 @@ define void @call_floor(float %a) #0 {
 define float @call_floor2(float %a) #0 {
 ; CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
 ; CHECK-LABEL: define {{[^@]+}}@call_floor2
-; CHECK-SAME: (float [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[C:%.*]] = tail call nofpclass(sub) float @llvm.floor.f32(float [[A]]) #[[ATTR30:[0-9]+]]
+; CHECK-SAME: (float nofpclass(sub) [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[C:%.*]] = tail call nofpclass(sub) float @llvm.floor.f32(float nofpclass(sub) [[A]]) #[[ATTR30:[0-9]+]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %c = tail call float @llvm.floor.f32(float %a)
@@ -855,7 +855,7 @@ define i32 @bounded_loop_inside_unbounded_loop(i32 %n) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
 ; CHECK:       while.cond:
-; CHECK-NEXT:    [[ANS_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TRUETMP2:%.*]], [[FOR_END:%.*]] ]
+; CHECK-NEXT:    [[ANS_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[FOR_END:%.*]] ]
 ; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ [[INC:%.*]], [[FOR_END]] ]
 ; CHECK-NEXT:    [[TMP:%.*]] = icmp sgt i32 [[N_ADDR_0]], -1
 ; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP]], i32 [[N_ADDR_0]], i32 -1
@@ -863,12 +863,12 @@ define i32 @bounded_loop_inside_unbounded_loop(i32 %n) {
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[N_ADDR_0]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
-; CHECK-NEXT:    [[TRUETMP1:%.*]] = add i32 [[ANS_0]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[ANS_0]], 1
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[TRUETMP2]] = add i32 [[TRUETMP1]], [[SMAX]]
+; CHECK-NEXT:    [[TMP2]] = add i32 [[TMP1]], [[SMAX]]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    unreachable
@@ -939,7 +939,7 @@ define i32 @nested_unbounded_loops(i32 %n) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
 ; CHECK:       while.cond:
-; CHECK-NEXT:    [[ANS_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TRUETMP1:%.*]], [[WHILE_END10:%.*]] ]
+; CHECK-NEXT:    [[ANS_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[WHILE_END10:%.*]] ]
 ; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ -1, [[WHILE_END10]] ]
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[N_ADDR_0]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END11:%.*]], label [[WHILE_BODY:%.*]]
@@ -957,7 +957,7 @@ define i32 @nested_unbounded_loops(i32 %n) {
 ; CHECK:       while.body8:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       while.end10:
-; CHECK-NEXT:    [[TRUETMP1]] = add i32 [[TMP]], [[ANS_0]]
+; CHECK-NEXT:    [[TMP1]] = add i32 [[TMP]], [[ANS_0]]
 ; CHECK-NEXT:    br label [[WHILE_COND]]
 ; CHECK:       while.end11:
 ; CHECK-NEXT:    [[ANS_0_LCSSA:%.*]] = phi i32 [ [[ANS_0]], [[WHILE_COND]] ]
diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
index 51d9e5f1ba32fc..be7ae0ef3f42c6 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
@@ -1,10 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn--amdhsa < %s | FileCheck %s
 
-; CHECK-LABEL: @no_sink_local_to_flat(
-; CHECK: addrspacecast
-; CHECK: br
-; CHECK-NOT: addrspacecast
 define i64 @no_sink_local_to_flat(i1 %pred, ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define i64 @no_sink_local_to_flat(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr addrspace(3) %ptr to ptr
   br i1 %pred, label %l1, label %l2
 
@@ -17,11 +25,18 @@ l2:
   ret i64 %v2
 }
 
-; CHECK-LABEL: @no_sink_private_to_flat(
-; CHECK: addrspacecast
-; CHECK: br
-; CHECK-NOT: addrspacecast
 define i64 @no_sink_private_to_flat(i1 %pred, ptr addrspace(5) %ptr) {
+; CHECK-LABEL: define i64 @no_sink_private_to_flat(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr addrspace(5) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr addrspace(5) %ptr to ptr
   br i1 %pred, label %l1, label %l2
 
@@ -35,11 +50,18 @@ l2:
 }
 
 
-; CHECK-LABEL: @sink_global_to_flat(
-; CHECK-NOT: addrspacecast
-; CHECK: br
-; CHECK: addrspacecast
 define i64 @sink_global_to_flat(i1 %pred, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i64 @sink_global_to_flat(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr addrspace(1) %ptr to ptr
   br i1 %pred, label %l1, label %l2
 
@@ -52,11 +74,18 @@ l2:
   ret i64 %v2
 }
 
-; CHECK-LABEL: @sink_flat_to_global(
-; CHECK-NOT: addrspacecast
-; CHECK: br
-; CHECK: addrspacecast
 define i64 @sink_flat_to_global(i1 %pred, ptr %ptr) {
+; CHECK-LABEL: define i64 @sink_flat_to_global(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(1)
   br i1 %pred, label %l1, label %l2
 
@@ -69,11 +98,18 @@ l2:
   ret i64 %v2
 }
 
-; CHECK-LABEL: @sink_flat_to_constant(
-; CHECK-NOT: addrspacecast
-; CHECK: br
-; CHECK: addrspacecast
 define i64 @sink_flat_to_constant(i1 %pred, ptr %ptr) {
+; CHECK-LABEL: define i64 @sink_flat_to_constant(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(4)
   br i1 %pred, label %l1, label %l2
 
@@ -86,11 +122,18 @@ l2:
   ret i64 %v2
 }
 
-; CHECK-LABEL: @sink_flat_to_local(
-; CHECK-NOT: addrspacecast
-; CHECK: br
-; CHECK: addrspacecast
 define i64 @sink_flat_to_local(i1 %pred, ptr %ptr) {
+; CHECK-LABEL: define i64 @sink_flat_to_local(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(3) [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(3)
   br i1 %pred, label %l1, label %l2
 
@@ -103,11 +146,18 @@ l2:
   ret i64 %v2
 }
 
-; CHECK-LABEL: @sink_flat_to_private(
-; CHECK-NOT: addrspacecast
-; CHECK: br
-; CHECK: addrspacecast
 define i64 @sink_flat_to_private(i1 %pred, ptr %ptr) {
+; CHECK-LABEL: define i64 @sink_flat_to_private(
+; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    ret i64 [[V1]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[V2]]
+;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(5)
   br i1 %pred, label %l1, label %l2
 
diff --git a/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-decrement.ll b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-decrement.ll
new file mode 100644
index 00000000000000..4ebbae54965a43
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-decrement.ll
@@ -0,0 +1,260 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -p constraint-elimination -S %s | FileCheck %s
+
+declare void @use(i1)
+declare void @llvm.assume(i1)
+
+define void @add_rec_decreasing_cond_true_constant(i8 noundef %len) {
+; CHECK-LABEL: define void @add_rec_decreasing_cond_true_constant(
+; CHECK-SAME: i8 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ 4, [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], 5
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ 4, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, 0
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, 5
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -1
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @add_rec_decreasing_cond_not_true_constant(i8 noundef %len) {
+; CHECK-LABEL: define void @add_rec_decreasing_cond_not_true_constant(
+; CHECK-SAME: i8 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ 4, [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], 4
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ 4, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, 0
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, 4
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -1
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @add_rec_decreasing_cond_true_start_signed_positive(i8 noundef %start) {
+; CHECK-LABEL: define void @add_rec_decreasing_cond_true_start_signed_positive(
+; CHECK-SAME: i8 noundef [[START:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PRECOND:%.*]] = icmp sge i8 [[START]], 1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[PRECOND]])
+; CHECK-NEXT:    [[START_1:%.*]] = add i8 [[START]], -1
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ [[START_1]], [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], [[START]]
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %precond = icmp sge i8 %start, 1
+  call void @llvm.assume(i1 %precond)
+  %start.1 = add i8 %start, -1
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ %start.1, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, 0
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, %start
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -1
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @add_rec_decreasing_cond_not_true_start_signed_positive(i8 noundef %start) {
+; CHECK-LABEL: define void @add_rec_decreasing_cond_not_true_start_signed_positive(
+; CHECK-SAME: i8 noundef [[START:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PRECOND:%.*]] = icmp sge i8 [[START]], 1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[PRECOND]])
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ [[START]], [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], [[START]]
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %precond = icmp sge i8 %start, 1
+  call void @llvm.assume(i1 %precond)
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ %start, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, 0
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, %start
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -1
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @add_rec_decreasing_add_rec_positive_to_negative(i8 noundef %len) {
+; CHECK-LABEL: define void @add_rec_decreasing_add_rec_positive_to_negative(
+; CHECK-SAME: i8 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ 4, [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], -2
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], 5
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ 4, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, -2
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, 5
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -1
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @add_rec_decreasing_2_cond_true_constant(i8 noundef %len) {
+; CHECK-LABEL: define void @add_rec_decreasing_2_cond_true_constant(
+; CHECK-SAME: i8 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ 4, [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], 5
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -2
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ 4, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, 0
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, 5
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -2
+  br label %loop.header
+
+exit:
+  ret void
+}
+
+define void @add_rec_decreasing_2_cond_not_true_constant(i8 noundef %len) {
+; CHECK-LABEL: define void @add_rec_decreasing_2_cond_not_true_constant(
+; CHECK-SAME: i8 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i8 [ 5, [[ENTRY:%.*]] ], [ [[K_DEC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i8 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i8 [[K_0]], 5
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i8 [[K_0]], -2
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %k.0 = phi i8 [ 5, %entry], [ %k.dec, %loop.latch ]
+  %cmp2.not = icmp eq i8 %k.0, 0
+  br i1 %cmp2.not, label %exit, label %loop.latch
+
+loop.latch:
+  %cmp.not.i = icmp ult i8 %k.0, 5
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i8 %k.0, -2
+  br label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-nested-loops.ll b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-nested-loops.ll
new file mode 100644
index 00000000000000..e5d101f7fdea10
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/monotonic-int-phis-nested-loops.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -p constraint-elimination -S %s | FileCheck %s
+
+declare void @use(i1)
+
+define void @start_value_of_inner_add_rec_is_add_rec_condition_can_be_simplified(i32 noundef %len) {
+; CHECK-LABEL: define void @start_value_of_inner_add_rec_is_add_rec_condition_can_be_simplified(
+; CHECK-SAME: i32 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_0]], [[LEN]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[INNER_HEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i32 [ [[I_0]], [[OUTER_HEADER]] ], [ [[K_INC:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i32 [[K_0]], [[LEN]]
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[OUTER_LATCH]], label [[INNER_LATCH]]
+; CHECK:       inner.latch:
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    [[K_INC]] = add i32 [[K_0]], 1
+; CHECK-NEXT:    br label [[INNER_HEADER]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_0]], 1
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %i.0 = phi i32 [ 1, %entry ], [ %i.inc, %outer.latch ]
+  %cmp = icmp ult i32 %i.0, %len
+  br i1 %cmp, label %inner.header, label %exit
+
+inner.header:
+  %k.0 = phi i32 [ %i.0, %outer.header ], [ %k.inc, %inner.latch ]
+  %cmp2.not = icmp eq i32 %k.0, %len
+  br i1 %cmp2.not, label %outer.latch, label %inner.latch
+
+inner.latch:
+  %cmp.not.i = icmp ult i32 %k.0, %len
+  call void @use(i1 %cmp.not.i)
+  %k.inc = add i32 %k.0, 1
+  br label %inner.header
+
+outer.latch:
+  %i.inc = add i32 %i.0, 1
+  br label %outer.header
+
+exit:
+  ret void
+}
+
+define void @start_value_of_inner_add_rec_is_add_rec_condition_cannot_be_simplified(i32 noundef %len) {
+; CHECK-LABEL: define void @start_value_of_inner_add_rec_is_add_rec_condition_cannot_be_simplified(
+; CHECK-SAME: i32 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_0]], [[LEN]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[INNER_HEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i32 [ [[I_0]], [[OUTER_HEADER]] ], [ [[K_INC:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i32 [[K_0]], [[LEN]]
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[OUTER_LATCH]], label [[INNER_LATCH]]
+; CHECK:       inner.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i32 [[K_0]], 3
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_INC]] = add i32 [[K_0]], 1
+; CHECK-NEXT:    br label [[INNER_HEADER]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_0]], 1
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %i.0 = phi i32 [ 1, %entry ], [ %i.inc, %outer.latch ]
+  %cmp = icmp ult i32 %i.0, %len
+  br i1 %cmp, label %inner.header, label %exit
+
+inner.header:
+  %k.0 = phi i32 [ %i.0, %outer.header ], [ %k.inc, %inner.latch ]
+  %cmp2.not = icmp eq i32 %k.0, %len
+  br i1 %cmp2.not, label %outer.latch, label %inner.latch
+
+inner.latch:
+  %cmp.not.i = icmp ult i32 %k.0, 3
+  call void @use(i1 %cmp.not.i)
+  %k.inc = add i32 %k.0, 1
+  br label %inner.header
+
+outer.latch:
+  %i.inc = add i32 %i.0, 1
+  br label %outer.header
+
+exit:
+  ret void
+}
+define void @inner_add_rec_decreasing(i32 noundef %len) {
+; CHECK-LABEL: define void @inner_add_rec_decreasing(
+; CHECK-SAME: i32 noundef [[LEN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_0]], [[LEN]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[INNER_HEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner.header:
+; CHECK-NEXT:    [[K_0:%.*]] = phi i32 [ [[I_0]], [[OUTER_HEADER]] ], [ [[K_DEC:%.*]], [[INNER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i32 [[K_0]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[OUTER_LATCH]], label [[INNER_LATCH]]
+; CHECK:       inner.latch:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ult i32 [[K_0]], [[LEN]]
+; CHECK-NEXT:    call void @use(i1 [[CMP_NOT_I]])
+; CHECK-NEXT:    [[K_DEC]] = add i32 [[K_0]], -1
+; CHECK-NEXT:    br label [[INNER_HEADER]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_0]], 1
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %i.0 = phi i32 [ 1, %entry ], [ %i.inc, %outer.latch ]
+  %cmp = icmp ult i32 %i.0, %len
+  br i1 %cmp, label %inner.header, label %exit
+
+inner.header:
+  %k.0 = phi i32 [ %i.0, %outer.header ], [ %k.dec, %inner.latch ]
+  %cmp2.not = icmp eq i32 %k.0, 0
+  br i1 %cmp2.not, label %outer.latch, label %inner.latch
+
+inner.latch:
+  %cmp.not.i = icmp ult i32 %k.0, %len
+  call void @use(i1 %cmp.not.i)
+  %k.dec = add i32 %k.0, -1
+  br label %inner.header
+
+outer.latch:
+  %i.inc = add i32 %i.0, 1
+  br label %outer.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
index 7e89f864c8110e..5c10fdcd2b4921 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -1010,6 +1010,123 @@ else:
   ret i1 %res2
 }
 
+define i1 @cttz_fold(i16 %x) {
+; CHECK-LABEL: @cttz_fold(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[X:%.*]], 256
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = icmp uge i16 [[CTTZ]], 8
+; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp ult i16 %x, 256
+  br i1 %cmp, label %if, label %else
+
+if:
+  %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %res = icmp uge i16 %cttz, 8
+  ret i1 %res
+
+else:
+  ret i1 false
+}
+
+define i1 @cttz_nofold1(i16 %x) {
+; CHECK-LABEL: @cttz_nofold1(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[X:%.*]], 256
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = icmp uge i16 [[CTTZ]], 7
+; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp ult i16 %x, 256
+  br i1 %cmp, label %if, label %else
+
+if:
+  %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %res = icmp uge i16 %cttz, 7
+  ret i1 %res
+
+else:
+  ret i1 false
+}
+
+define i1 @cttz_nofold2(i16 %x) {
+; CHECK-LABEL: @cttz_nofold2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[X:%.*]], 256
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X]], i1 false)
+; CHECK-NEXT:    [[RES:%.*]] = icmp uge i16 [[CTTZ]], 8
+; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp ult i16 %x, 256
+  br i1 %cmp, label %if, label %else
+
+if:
+  %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 false)
+  %res = icmp uge i16 %cttz, 8
+  ret i1 %res
+
+else:
+  ret i1 false
+}
+
+define i1 @ctpop_fold(i16 %x) {
+; CHECK-LABEL: @ctpop_fold(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[X:%.*]], 256
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[CTPOP:%.*]] = call i16 @llvm.ctpop.i16(i16 [[X]])
+; CHECK-NEXT:    [[RES:%.*]] = icmp ule i16 [[CTPOP]], 8
+; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp ult i16 %x, 256
+  br i1 %cmp, label %if, label %else
+
+if:
+  %ctpop = call i16 @llvm.ctpop.i16(i16 %x)
+  %res = icmp ule i16 %ctpop, 8
+  ret i1 %res
+
+else:
+  ret i1 true
+}
+
+define i1 @ctpop_nofold(i16 %x) {
+; CHECK-LABEL: @ctpop_nofold(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[X:%.*]], 256
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[CTPOP:%.*]] = call i16 @llvm.ctpop.i16(i16 [[X]])
+; CHECK-NEXT:    [[RES:%.*]] = icmp ule i16 [[CTPOP]], 7
+; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp ult i16 %x, 256
+  br i1 %cmp, label %if, label %else
+
+if:
+  %ctpop = call i16 @llvm.ctpop.i16(i16 %x)
+  %res = icmp ule i16 %ctpop, 7
+  ret i1 %res
+
+else:
+  ret i1 true
+}
+
 declare i16 @llvm.ctlz.i16(i16, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i16 @llvm.ctpop.i16(i16)
 declare i16 @llvm.abs.i16(i16, i1)
 declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/GVN/pr48805.ll b/llvm/test/Transforms/GVN/pr48805.ll
new file mode 100644
index 00000000000000..e3fdc280255fa8
--- /dev/null
+++ b/llvm/test/Transforms/GVN/pr48805.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=gvn < %s | FileCheck %s
+
+declare void @willreturn() nounwind willreturn
+
+declare void @capture(ptr)
+
+; Make sure ICF is invalidated when the callee becomes known.
+
+define i64 @test(ptr %p) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x ptr], align 8
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr ptr, ptr [[A]], i64 1
+; CHECK-NEXT:    call void @capture(ptr [[A]])
+; CHECK-NEXT:    br i1 false, label [[IF:%.*]], label [[ENTRY_EXIT_CRIT_EDGE:%.*]]
+; CHECK:       entry.exit_crit_edge:
+; CHECK-NEXT:    [[RES_PRE:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[P1:%.*]] = load ptr, ptr [[A2]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[RES_PRE]], [[ENTRY_EXIT_CRIT_EDGE]] ], [ undef, [[IF]] ]
+; CHECK-NEXT:    store ptr @willreturn, ptr [[P]], align 8
+; CHECK-NEXT:    tail call void @willreturn()
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+entry:
+  %a = alloca [2 x ptr], align 8
+  %a2 = getelementptr ptr, ptr %a, i64 1
+  call void @capture(ptr %a)
+  br i1 false, label %if, label %exit
+
+if:
+  %p0 = load ptr, ptr %a, align 8
+  %p1 = load ptr, ptr %a2, align 8
+  br label %exit
+
+exit:
+  store ptr @willreturn, ptr %p
+  %p2 = load ptr, ptr %a, align 8
+  %pgocount.i = load i64, ptr %p2, align 8
+  %fn = load ptr, ptr %p
+  tail call void %fn()
+  %res = load i64, ptr %a2, align 8
+  ret i64 %res
+}
diff --git a/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll b/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll
new file mode 100644
index 00000000000000..3163fc7071e978
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll
@@ -0,0 +1,37 @@
+; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -S %s | FileCheck %s
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+; CHECK: define void @foo
+; CHECK-NOT: dbg.assign
+; CHECK: call void @foo.cold
+; CHECK-NOT: dbg.assign
+; CHECK: define internal void @foo.cold
+; CHECK-NOT: dbg.assign
+define void @foo() !dbg !10 {
+  %buf.i = alloca i32, align 4, !DIAssignID !8
+  br i1 false, label %if.else, label %if.then
+
+if.then:                                          ; preds = %0
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !9, metadata !DIExpression(), metadata !8, metadata ptr %buf.i, metadata !DIExpression()), !dbg !14
+  unreachable
+
+if.else:                                          ; preds = %0
+  ret void
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "blah", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2)
+!1 = !DIFile(filename: "blah", directory: "blah")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubroutineType(types: !7)
+!7 = !{null}
+!8 = distinct !DIAssignID()
+!9 = !DILocalVariable(name: "buf", scope: !10, file: !1, line: 1774, type: !13)
+!10 = distinct !DISubprogram(name: "blah", scope: !1, file: !1, line: 1771, type: !11, scopeLine: 1773, unit: !0, retainedNodes: !2)
+!11 = !DISubroutineType(cc: DW_CC_nocall, types: !7)
+!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_unsigned_char)
+!14 = !DILocation(line: 0, scope: !10)
diff --git a/llvm/test/Transforms/Inline/ML/coro-split-func-levels.ll b/llvm/test/Transforms/Inline/ML/coro-split-func-levels.ll
new file mode 100644
index 00000000000000..79e1ebeec17b92
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/coro-split-func-levels.ll
@@ -0,0 +1,41 @@
+; REQUIRES: llvm_inliner_model_autogenerated
+; RUN: opt -S -passes='coro-early,scc-oz-module-inliner,print<inline-advisor>' \
+; RUN:  -enable-ml-inliner=release -keep-inline-advisor-for-printing < %s
+
+define void @_Z5get_sv() presplitcoroutine {
+  %1 = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+  %2 = call ptr @llvm.coro.begin(token %1, ptr null)
+  %3 = call token @llvm.coro.save(ptr null)
+  %4 = call i8 @llvm.coro.suspend(token none, i1 false)
+  call void @_ZN1S12promise_typeD2Ev()
+  ret void
+}
+
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
+declare ptr @llvm.coro.begin(token, ptr writeonly)
+declare token @llvm.coro.save(ptr)
+declare i8 @llvm.coro.suspend(token, i1)
+
+declare void @__clang_call_terminate()
+
+define void @_ZN1S12promise_typeD2Ev() personality ptr null {
+  invoke void @_Z4funcv()
+          to label %1 unwind label %2
+
+1:                                                ; preds = %0
+  ret void
+
+2:                                                ; preds = %0
+  %3 = landingpad { ptr, i32 }
+          catch ptr null
+  call void @__clang_call_terminate()
+  unreachable
+}
+declare void @_Z4funcv()
+
+; CHECK:      [MLInlineAdvisor] FuncLevels:
+; CHECK-NEXT: _Z5get_sv : 1
+; CHECK-NEXT: _ZN1S12promise_typeD2Ev : 0
+; CHECK-NEXT: _Z5get_sv.resume : 1
+; CHECK-NEXT: _Z5get_sv.destroy : 1
+; CHECK-NEXT: _Z5get_sv.cleanup : 1
diff --git a/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll b/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll
index 40731ec83a70e4..bbce9e43b10c98 100644
--- a/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll
+++ b/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll
@@ -1,9 +1,10 @@
 ; Based on llvm/test/Transforms/Coroutines/coro-split-02.ll
 ; Corosplit will keep f1 and add 3 more functions.
 ; RUN: opt -passes='default<O1>,print<inline-advisor>' -training-log=/dev/null \
-; RUN:   -S -enable-ml-inliner=development -keep-inline-advisor-for-printing < %s 2>&1 | FileCheck %s
+; RUN:   -S -enable-ml-inliner=development -enable-scc-inline-advisor-printing < %s 2>&1 | FileCheck %s
 ; REQUIRES: have_tflite
 ;
+; CHECK: [MLInlineAdvisor] Nodes: 1 Edges: 0
 ; CHECK: [MLInlineAdvisor] Nodes: 4 Edges: 0
 
 %"struct.std::coroutine_handle" = type { ptr }
diff --git a/llvm/test/Transforms/Inline/byval.ll b/llvm/test/Transforms/Inline/byval.ll
index d7e6efcb1a2c38..dd5be40b90a8f2 100644
--- a/llvm/test/Transforms/Inline/byval.ll
+++ b/llvm/test/Transforms/Inline/byval.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt < %s -passes=inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
 
@@ -10,34 +11,46 @@ target datalayout = "p:32:32-p1:64:64-p2:16:16-n16:32:64"
 
 ; Inlining a byval struct should cause an explicit copy into an alloca.
 
-	%struct.ss = type { i32, i64 }
+  %struct.ss = type { i32, i64 }
 @.str = internal constant [10 x i8] c"%d, %lld\0A\00"		; <ptr> [#uses=1]
 
 define internal void @f(ptr byval(%struct.ss)  %b) nounwind  {
 entry:
-	%tmp = getelementptr %struct.ss, ptr %b, i32 0, i32 0		; <ptr> [#uses=2]
-	%tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=1]
-	%tmp2 = add i32 %tmp1, 1		; <i32> [#uses=1]
-	store i32 %tmp2, ptr %tmp, align 4
-	ret void
+  %tmp = getelementptr %struct.ss, ptr %b, i32 0, i32 0		; <ptr> [#uses=2]
+  %tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=1]
+  %tmp2 = add i32 %tmp1, 1		; <i32> [#uses=1]
+  store i32 %tmp2, ptr %tmp, align 4
+  ret void
 }
 
 declare i32 @printf(ptr, ...) nounwind
 
 define i32 @test1() nounwind  {
+; CHECK-LABEL: define i32 @test1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S1:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 0
+; CHECK-NEXT:    store i32 1, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CHECK-NEXT:    store i64 2, ptr [[TMP4]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr [[S1]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1]], ptr align 1 [[S]], i64 12, i1 false)
+; CHECK-NEXT:    [[TMP1_I:%.*]] = load i32, ptr [[S1]], align 4
+; CHECK-NEXT:    [[TMP2_I:%.*]] = add i32 [[TMP1_I]], 1
+; CHECK-NEXT:    store i32 [[TMP2_I]], ptr [[S1]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr [[S1]])
+; CHECK-NEXT:    ret i32 0
+;
 entry:
-	%S = alloca %struct.ss		; <ptr> [#uses=4]
-	%tmp1 = getelementptr %struct.ss, ptr %S, i32 0, i32 0		; <ptr> [#uses=1]
-	store i32 1, ptr %tmp1, align 8
-	%tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1		; <ptr> [#uses=1]
-	store i64 2, ptr %tmp4, align 4
-	call void @f(ptr byval(%struct.ss) %S) nounwind
-	ret i32 0
-; CHECK: @test1()
-; CHECK: %S1 = alloca %struct.ss
-; CHECK: %S = alloca %struct.ss
-; CHECK: call void @llvm.memcpy
-; CHECK: ret i32 0
+  %S = alloca %struct.ss		; <ptr> [#uses=4]
+  %tmp1 = getelementptr %struct.ss, ptr %S, i32 0, i32 0		; <ptr> [#uses=1]
+  store i32 1, ptr %tmp1, align 8
+  %tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1		; <ptr> [#uses=1]
+  store i64 2, ptr %tmp4, align 4
+  call void @f(ptr byval(%struct.ss) %S) nounwind
+  ret i32 0
 }
 
 ; Inlining a byval struct should NOT cause an explicit copy
@@ -45,25 +58,33 @@ entry:
 
 define internal i32 @f2(ptr byval(%struct.ss)  %b) nounwind readonly {
 entry:
-	%tmp = getelementptr %struct.ss, ptr %b, i32 0, i32 0		; <ptr> [#uses=2]
-	%tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=1]
-	%tmp2 = add i32 %tmp1, 1		; <i32> [#uses=1]
-	ret i32 %tmp2
+  %tmp = getelementptr %struct.ss, ptr %b, i32 0, i32 0		; <ptr> [#uses=2]
+  %tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=1]
+  %tmp2 = add i32 %tmp1, 1		; <i32> [#uses=1]
+  ret i32 %tmp2
 }
 
 define i32 @test2() nounwind  {
+; CHECK-LABEL: define i32 @test2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 0
+; CHECK-NEXT:    store i32 1, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CHECK-NEXT:    store i64 2, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP1_I:%.*]] = load i32, ptr [[S]], align 4
+; CHECK-NEXT:    [[TMP2_I:%.*]] = add i32 [[TMP1_I]], 1
+; CHECK-NEXT:    ret i32 [[TMP2_I]]
+;
 entry:
-	%S = alloca %struct.ss		; <ptr> [#uses=4]
-	%tmp1 = getelementptr %struct.ss, ptr %S, i32 0, i32 0		; <ptr> [#uses=1]
-	store i32 1, ptr %tmp1, align 8
-	%tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1		; <ptr> [#uses=1]
-	store i64 2, ptr %tmp4, align 4
-	%X = call i32 @f2(ptr byval(%struct.ss) %S) nounwind
-	ret i32 %X
-; CHECK: @test2()
-; CHECK: %S = alloca %struct.ss
-; CHECK-NOT: call void @llvm.memcpy
-; CHECK: ret i32
+  %S = alloca %struct.ss		; <ptr> [#uses=4]
+  %tmp1 = getelementptr %struct.ss, ptr %S, i32 0, i32 0		; <ptr> [#uses=1]
+  store i32 1, ptr %tmp1, align 8
+  %tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1		; <ptr> [#uses=1]
+  store i64 2, ptr %tmp4, align 4
+  %X = call i32 @f2(ptr byval(%struct.ss) %S) nounwind
+  ret i32 %X
 }
 
 
@@ -73,21 +94,26 @@ entry:
 declare void @g3(ptr %p)
 
 define internal void @f3(ptr byval(%struct.ss) align 64 %b) nounwind {
-   call void @g3(ptr %b)  ;; Could make alignment assumptions!
-   ret void
+  call void @g3(ptr %b)  ;; Could make alignment assumptions!
+  ret void
 }
 
 define void @test3() nounwind  {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S1:%.*]] = alloca [[STRUCT_SS:%.*]], align 64
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr [[S1]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1]], ptr align 1 [[S]], i64 12, i1 false)
+; CHECK-NEXT:    call void @g3(ptr [[S1]]) #[[ATTR0]]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr [[S1]])
+; CHECK-NEXT:    ret void
+;
 entry:
-	%S = alloca %struct.ss, align 1  ;; May not be aligned.
-	call void @f3(ptr byval(%struct.ss) align 64 %S) nounwind
-	ret void
-; CHECK: @test3()
-; CHECK: %S1 = alloca %struct.ss, align 64
-; CHECK: %S = alloca %struct.ss
-; CHECK: call void @llvm.memcpy
-; CHECK: call void @g3(ptr %S1)
-; CHECK: ret void
+  %S = alloca %struct.ss, align 1  ;; May not be aligned.
+  call void @f3(ptr byval(%struct.ss) align 64 %S) nounwind
+  ret void
 }
 
 
@@ -96,20 +122,22 @@ entry:
 ; alignment to satisfy an explicit alignment request.
 
 define internal i32 @f4(ptr byval(%struct.ss) align 64 %b) nounwind readonly {
-        call void @g3(ptr %b)
-	ret i32 4
+  call void @g3(ptr %b)
+  ret i32 4
 }
 
 define i32 @test4() nounwind  {
+; CHECK-LABEL: define i32 @test4(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 64
+; CHECK-NEXT:    call void @g3(ptr [[S]]) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 4
+;
 entry:
-	%S = alloca %struct.ss, align 2		; <ptr> [#uses=4]
-	%X = call i32 @f4(ptr byval(%struct.ss) align 64 %S) nounwind
-	ret i32 %X
-; CHECK: @test4()
-; CHECK: %S = alloca %struct.ss, align 64
-; CHECK-NOT: call void @llvm.memcpy
-; CHECK: call void @g3
-; CHECK: ret i32 4
+  %S = alloca %struct.ss, align 2		; <ptr> [#uses=4]
+  %X = call i32 @f4(ptr byval(%struct.ss) align 64 %S) nounwind
+  ret i32 %X
 }
 
 %struct.S0 = type { i32 }
@@ -119,20 +147,29 @@ entry:
 
 define internal void @f5(ptr byval(%struct.S0) nocapture readonly align 4 %p) {
 entry:
-	store i32 0, ptr @b, align 4
-	%0 = load i32, ptr %p, align 4
-	store i32 %0, ptr @a, align 4
-	ret void
+  store i32 0, ptr @b, align 4
+  %0 = load i32, ptr %p, align 4
+  store i32 %0, ptr @a, align 4
+  ret void
 }
 
 define i32 @test5() {
+; CHECK-LABEL: define i32 @test5() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_S0:%.*]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[B]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[B]], ptr align 1 @b, i64 4, i1 false)
+; CHECK-NEXT:    store i32 0, ptr @b, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr @a, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
 entry:
-	tail call void @f5(ptr byval(%struct.S0) align 4 @b)
-	%0 = load i32, ptr @a, align 4
-	ret i32 %0
-; CHECK: @test5()
-; CHECK: store i32 0, ptr @b, align 4
-; CHECK-NOT: load i32, ptr @b, align 4
+  tail call void @f5(ptr byval(%struct.S0) align 4 @b)
+  %0 = load i32, ptr @a, align 4
+  ret i32 %0
 }
 
 ; Inlining a byval struct that is in a different address space compared to the
@@ -146,18 +183,30 @@ entry:
 @c = common addrspace(1) global i32 0, align 4
 
 define internal void @f5_as1(ptr addrspace(1) byval(%struct.S1) nocapture readonly align 4 %p) {
+; CHECK-LABEL: define internal void @f5_as1(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly byval([[STRUCT_S1:%.*]]) align 4 [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) @d, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr addrspace(1) @c, align 4
+; CHECK-NEXT:    ret void
+;
 entry:
-	store i32 0, ptr addrspace(1) @d, align 4
-	%0 = load i32, ptr addrspace(1) %p, align 4
-	store i32 %0, ptr addrspace(1) @c, align 4
-	ret void
+  store i32 0, ptr addrspace(1) @d, align 4
+  %0 = load i32, ptr addrspace(1) %p, align 4
+  store i32 %0, ptr addrspace(1) @c, align 4
+  ret void
 }
 
 define i32 @test5_as1() {
+; CHECK-LABEL: define i32 @test5_as1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @f5_as1(ptr addrspace(1) byval([[STRUCT_S1:%.*]]) align 4 @d)
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(1) @c, align 4
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
 entry:
-	tail call void @f5_as1(ptr addrspace(1) byval(%struct.S1) align 4 @d)
-	%0 = load i32, ptr addrspace(1) @c, align 4
-	ret i32 %0
-; CHECK: @test5_as1()
-; CHECK: call void @f5_as1
+  tail call void @f5_as1(ptr addrspace(1) byval(%struct.S1) align 4 @d)
+  %0 = load i32, ptr addrspace(1) @c, align 4
+  ret i32 %0
 }
diff --git a/llvm/test/Transforms/Inline/inline-cost-benefit-multiplier-override.ll b/llvm/test/Transforms/Inline/inline-cost-benefit-multiplier-override.ll
new file mode 100644
index 00000000000000..d367b74358fa1c
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-cost-benefit-multiplier-override.ll
@@ -0,0 +1,67 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -pass-remarks=inline -pass-remarks-missed=inline -inline-savings-multiplier=4 -inline-savings-profitable-multiplier=5 -S 2>&1| FileCheck %s
+
+; Test that inline cost benefit multipler could be configured from command line.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; @inlined_caleee is inlined by cost-benefit anlysis.
+; @not_inlined_callee is not inlined, decided by cost-benefit analysis
+; CHECK: remark: <unknown>:0:0: 'inlined_callee' inlined into 'caller' with (cost=always): benefit over cost
+; CHECK: remark: <unknown>:0:0: 'not_inlined_callee' not inlined into 'caller' because it should never be inlined (cost=never): cost over benefit
+
+define i32 @inlined_callee(i32 %c) !prof !17 {
+entry:
+  %mul = mul nsw i32 %c, %c
+  ret i32 %mul
+}
+
+define i32 @not_inlined_callee(i32 %c) !prof !18 {
+entry:
+  %add = add nsw i32 %c, 2
+  ret i32 %add
+}
+
+define i32 @caller(i32 %a, i32 %c)  !prof !15 {
+entry:
+  %rem = srem i32 %a, 3
+  %cmp = icmp eq i32 %rem, 0
+  br i1 %cmp, label %if.then, label %if.end, !prof !16
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: call i32 @inlined_callee
+  %call = tail call i32 @inlined_callee(i32 %c) "inline-cycle-savings-for-test"="26" "inline-runtime-cost-for-test"="1"
+  br label %return
+
+if.end:
+; CHECK-LABEL: if.end:
+; CHECK: call i32 @not_inlined_callee
+  %call1 = tail call i32 @not_inlined_callee(i32 %c) "inline-cycle-savings-for-test"="19" "inline-runtime-cost-for-test"="1"
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.end ]
+  ret i32 %retval.0
+}
+
+!llvm.module.flags = !{!1}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 990000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"function_entry_count", i64 500}
+!16 = !{!"branch_weights", i32 1, i32 2}
+!17 = !{!"function_entry_count", i64 200}
+!18 = !{!"function_entry_count", i64 400}
diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
index ddb94ef3fe4e07..c038ffccf3e96d 100644
--- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
+++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
@@ -30,7 +30,7 @@ define ptr @caller0() {
 
 define ptr @caller1() {
 ; CHECK-LABEL: define ptr @caller1() {
-; CHECK-NEXT:    [[R_I:%.*]] = call ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call align 16 ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call align(16) ptr @callee0123()
@@ -39,7 +39,7 @@ define ptr @caller1() {
 
 define ptr @caller2() {
 ; CHECK-LABEL: define ptr @caller2() {
-; CHECK-NEXT:    [[R_I:%.*]] = call ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call noundef ptr @callee0123()
@@ -57,7 +57,7 @@ define ptr @caller3() {
 
 define ptr @caller_0123_dornull() {
 ; CHECK-LABEL: define ptr @caller_0123_dornull() {
-; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable_or_null(16) ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef align 32 dereferenceable_or_null(16) ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call noundef align(32) dereferenceable_or_null(16) ptr @callee0123()
@@ -66,7 +66,7 @@ define ptr @caller_0123_dornull() {
 
 define ptr @caller_0123_d() {
 ; CHECK-LABEL: define ptr @caller_0123_d() {
-; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable(16) ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef align 32 dereferenceable(16) ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call noundef align(32) dereferenceable(16) ptr @callee0123()
@@ -105,7 +105,7 @@ define ptr @callee5() {
 
 define ptr @caller5_fail() {
 ; CHECK-LABEL: define ptr @caller5_fail() {
-; CHECK-NEXT:    [[R_I:%.*]] = call align 64 ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef align 64 ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call noundef align(32) ptr @callee5()
@@ -114,7 +114,7 @@ define ptr @caller5_fail() {
 
 define ptr @caller5_okay() {
 ; CHECK-LABEL: define ptr @caller5_okay() {
-; CHECK-NEXT:    [[R_I:%.*]] = call align 64 ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef align 128 ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call noundef align(128) ptr @callee5()
@@ -132,7 +132,7 @@ define ptr @callee6() {
 
 define ptr @caller6_fail() {
 ; CHECK-LABEL: define ptr @caller6_fail() {
-; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable(8) ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable(16) ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call dereferenceable(8) ptr @callee6()
@@ -159,7 +159,7 @@ define ptr @callee7() {
 
 define ptr @caller7_fail() {
 ; CHECK-LABEL: define ptr @caller7_fail() {
-; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable_or_null(8) ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable_or_null(16) ptr @foo()
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
   %r = call dereferenceable_or_null(8) ptr @callee7()
@@ -249,7 +249,7 @@ define ptr @caller10_fail_maybe_poison() {
 
 define ptr @caller10_okay_will_be_ub() {
 ; CHECK-LABEL: define ptr @caller10_okay_will_be_ub() {
-; CHECK-NEXT:    [[R_I:%.*]] = call nonnull ptr @foo()
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef nonnull ptr @foo()
 ; CHECK-NEXT:    call void @use.ptr(ptr [[R_I]])
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
diff --git a/llvm/test/Transforms/Inline/ret_attr_update.ll b/llvm/test/Transforms/Inline/ret_attr_update.ll
index bba7147ad00467..7004b5795125d1 100644
--- a/llvm/test/Transforms/Inline/ret_attr_update.ll
+++ b/llvm/test/Transforms/Inline/ret_attr_update.ll
@@ -153,7 +153,7 @@ define internal ptr @callee6(ptr %p) alwaysinline {
 define ptr @test6(ptr %ptr, i64 %x) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[X:%.*]]
-; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable_or_null(12) ptr @foo(ptr [[GEP]])
+; CHECK-NEXT:    [[R_I:%.*]] = call dereferenceable_or_null(16) ptr @foo(ptr [[GEP]])
 ; CHECK-NEXT:    [[V_I:%.*]] = call ptr @baz(ptr [[GEP]])
 ; CHECK-NEXT:    ret ptr [[R_I]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll
index 90f027010e2aea..528a4e62c192d0 100644
--- a/llvm/test/Transforms/InstCombine/and.ll
+++ b/llvm/test/Transforms/InstCombine/and.ll
@@ -2546,3 +2546,39 @@ define i32 @and_zext_eq_zero(i32 %A, i32 %C)  {
   %5 = and i32 %2, %4
   ret i32 %5
 }
+
+define i32 @canonicalize_and_add_power2_or_zero(i32 %x, i32 %y) {
+; CHECK-LABEL: @canonicalize_and_add_power2_or_zero(
+; CHECK-NEXT:    [[NY:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = and i32 [[NY]], [[Y]]
+; CHECK-NEXT:    call void @use32(i32 [[P2]])
+; CHECK-NEXT:    [[VAL:%.*]] = add i32 [[P2]], [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[VAL]], [[P2]]
+; CHECK-NEXT:    ret i32 [[AND]]
+;
+  %ny = sub i32 0, %y
+  %p2 = and i32 %y, %ny
+  call void @use32(i32 %p2) ; keep p2
+
+  %val = add i32 %x, %p2
+  %and = and i32 %val, %p2
+  ret i32 %and
+}
+
+define i32 @canonicalize_and_sub_power2_or_zero(i32 %x, i32 %y) {
+; CHECK-LABEL: @canonicalize_and_sub_power2_or_zero(
+; CHECK-NEXT:    [[NY:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = and i32 [[NY]], [[Y]]
+; CHECK-NEXT:    call void @use32(i32 [[P2]])
+; CHECK-NEXT:    [[VAL:%.*]] = sub i32 [[X:%.*]], [[P2]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[VAL]], [[P2]]
+; CHECK-NEXT:    ret i32 [[AND]]
+;
+  %ny = sub i32 0, %y
+  %p2 = and i32 %y, %ny
+  call void @use32(i32 %p2) ; keep p2
+
+  %val = sub i32 %x, %p2
+  %and = and i32 %val, %p2
+  ret i32 %and
+}
diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll
index a59e19897f061d..6cd4132eadd77b 100644
--- a/llvm/test/Transforms/InstCombine/binop-select.ll
+++ b/llvm/test/Transforms/InstCombine/binop-select.ll
@@ -324,12 +324,12 @@ define i32 @sub_sel_op1_use(i1 %b) {
 ; CHECK-LABEL: @sub_sel_op1_use(
 ; CHECK-NEXT:    [[S:%.*]] = select i1 [[B:%.*]], i32 42, i32 41
 ; CHECK-NEXT:    call void @use(i32 [[S]])
-; CHECK-NEXT:    [[R:%.*]] = sub nsw i32 42, [[S]]
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i32 42, [[S]]
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %s = select i1 %b, i32 42, i32 41
   call void @use(i32 %s)
-  %r = sub nsw i32 42, %s
+  %r = sub nuw nsw i32 42, %s
   ret i32 %r
 }
 
diff --git a/llvm/test/Transforms/InstCombine/constant-fold-shifts.ll b/llvm/test/Transforms/InstCombine/constant-fold-shifts.ll
index 5b2c8d02797544..57d6144bbee29e 100644
--- a/llvm/test/Transforms/InstCombine/constant-fold-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/constant-fold-shifts.ll
@@ -8,7 +8,7 @@
 define void @ossfuzz_14169_test1(ptr %a0) {
 ; CHECK-LABEL: @ossfuzz_14169_test1(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store ptr undef, ptr undef, align 8
+; CHECK-NEXT:    store ptr poison, ptr undef, align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -24,7 +24,7 @@ bb:
 define void @ossfuzz_14169_test2(ptr %a0) {
 ; CHECK-LABEL: @ossfuzz_14169_test2(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store ptr undef, ptr undef, align 8
+; CHECK-NEXT:    store ptr poison, ptr undef, align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
index 9f8dc9e8ac234c..24ccad4e157b93 100644
--- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
+++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -41,8 +41,8 @@ declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
 
 define float @test_shrink_libcall_floor(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_floor(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.floor.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.floor.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   ; --> floorf
@@ -53,8 +53,8 @@ define float @test_shrink_libcall_floor(float %C) {
 
 define float @test_shrink_libcall_ceil(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_ceil(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.ceil.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.ceil.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   ; --> ceilf
@@ -65,8 +65,8 @@ define float @test_shrink_libcall_ceil(float %C) {
 
 define float @test_shrink_libcall_round(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_round(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.round.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.round.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   ; --> roundf
@@ -77,8 +77,8 @@ define float @test_shrink_libcall_round(float %C) {
 
 define float @test_shrink_libcall_roundeven(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_roundeven(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.roundeven.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.roundeven.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   ; --> roundeven
@@ -89,8 +89,8 @@ define float @test_shrink_libcall_roundeven(float %C) {
 
 define float @test_shrink_libcall_nearbyint(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_nearbyint(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.nearbyint.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.nearbyint.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   ; --> nearbyintf
@@ -101,8 +101,8 @@ define float @test_shrink_libcall_nearbyint(float %C) {
 
 define float @test_shrink_libcall_trunc(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_trunc(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.trunc.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   ; --> truncf
@@ -115,8 +115,8 @@ define float @test_shrink_libcall_trunc(float %C) {
 ; tested platforms.
 define float @test_shrink_libcall_fabs(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_fabs(
-; CHECK-NEXT:    [[F:%.*]] = call float @llvm.fabs.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   %E = call double @fabs(double %D)
@@ -127,8 +127,8 @@ define float @test_shrink_libcall_fabs(float %C) {
 ; Make sure fast math flags are preserved
 define float @test_shrink_libcall_fabs_fast(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_fabs_fast(
-; CHECK-NEXT:    [[F:%.*]] = call fast float @llvm.fabs.f32(float [[C:%.*]])
-; CHECK-NEXT:    ret float [[F]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.fabs.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %D = fpext float %C to double
   %E = call fast double @fabs(double %D)
diff --git a/llvm/test/Transforms/InstCombine/ffs-i16.ll b/llvm/test/Transforms/InstCombine/ffs-i16.ll
index f86ae34e340fb1..f567a9488d4985 100644
--- a/llvm/test/Transforms/InstCombine/ffs-i16.ll
+++ b/llvm/test/Transforms/InstCombine/ffs-i16.ll
@@ -3,8 +3,8 @@
 ; Test that the ffs library call simplifier works correctly even for
 ; targets with 16-bit int.
 ;
-; RUN: opt < %s -mtriple=avr-linux -passes=instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=msp430-linux -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=avr-linux -passes=instcombine -S | FileCheck %s --check-prefix=AVR
+; RUN: opt < %s -mtriple=msp430-linux -passes=instcombine -S | FileCheck %s --check-prefix=MSP430
 
 declare i16 @ffs(i16)
 
@@ -12,15 +12,25 @@ declare void @sink(i16)
 
 
 define void @fold_ffs(i16 %x) {
-; CHECK-LABEL: @fold_ffs(
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[CTTZ]], 1
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[X]], 0
-; CHECK-NEXT:    [[NX:%.*]] = select i1 [[DOTNOT]], i16 0, i16 [[TMP1]]
-; CHECK-NEXT:    call void @sink(i16 [[NX]])
-; CHECK-NEXT:    ret void
+; AVR-LABEL: @fold_ffs(
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]]
+; AVR-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[CTTZ]], 1
+; AVR-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[X]], 0
+; AVR-NEXT:    [[NX:%.*]] = select i1 [[DOTNOT]], i16 0, i16 [[TMP1]]
+; AVR-NEXT:    call void @sink(i16 [[NX]])
+; AVR-NEXT:    ret void
+;
+; MSP430-LABEL: @fold_ffs(
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]]
+; MSP430-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[CTTZ]], 1
+; MSP430-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[X]], 0
+; MSP430-NEXT:    [[NX:%.*]] = select i1 [[DOTNOT]], i16 0, i16 [[TMP1]]
+; MSP430-NEXT:    call void @sink(i16 [[NX]])
+; MSP430-NEXT:    ret void
 ;
   %n0 = call i16 @ffs(i16 0)
   call void @sink(i16 %n0)
diff --git a/llvm/test/Transforms/InstCombine/fls-i16.ll b/llvm/test/Transforms/InstCombine/fls-i16.ll
index e19b230d3163d5..ba0471665a4821 100644
--- a/llvm/test/Transforms/InstCombine/fls-i16.ll
+++ b/llvm/test/Transforms/InstCombine/fls-i16.ll
@@ -4,8 +4,8 @@
 ; targets with 16-bit int.  Although fls is available on a number of
 ; targets it's supported (hardcoded as available) only on FreeBSD.
 ;
-; RUN: opt < %s -mtriple=avr-freebsd -passes=instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=msp430-freebsd -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=avr-freebsd -passes=instcombine -S | FileCheck %s --check-prefix=AVR
+; RUN: opt < %s -mtriple=msp430-freebsd -passes=instcombine -S | FileCheck %s --check-prefix=MSP430
 
 declare i16 @fls(i16)
 
@@ -13,13 +13,21 @@ declare void @sink(i16)
 
 
 define void @fold_fls(i16 %x) {
-; CHECK-LABEL: @fold_fls(
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[NX:%.*]] = sub nuw nsw i16 16, [[CTLZ]]
-; CHECK-NEXT:    call void @sink(i16 [[NX]])
-; CHECK-NEXT:    ret void
+; AVR-LABEL: @fold_fls(
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]]
+; AVR-NEXT:    [[NX:%.*]] = sub nuw nsw i16 16, [[CTLZ]]
+; AVR-NEXT:    call void @sink(i16 [[NX]])
+; AVR-NEXT:    ret void
+;
+; MSP430-LABEL: @fold_fls(
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]]
+; MSP430-NEXT:    [[NX:%.*]] = sub nuw nsw i16 16, [[CTLZ]]
+; MSP430-NEXT:    call void @sink(i16 [[NX]])
+; MSP430-NEXT:    ret void
 ;
   %n0 = call i16 @fls(i16 0)
   call void @sink(i16 %n0)
diff --git a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll
index 41285c78f03edb..7be15e38d44a7c 100644
--- a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll
+++ b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll
@@ -168,7 +168,7 @@ define i32 @test10() {
 define i16 @constant_fold_custom_dl() {
 ; CHECK-LABEL: @constant_fold_custom_dl(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret i16 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr inbounds ([1000 x i8], ptr addrspace(1) @X_as1, i32 1, i32 0), i32 sext (i16 sub (i16 0, i16 ptrtoint (ptr addrspace(1) @X_as1 to i16)) to i32)) to i16)
+; CHECK-NEXT:    ret i16 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr inbounds ([1000 x i8], ptr addrspace(1) @X_as1, i32 1, i32 0), i16 sub (i16 0, i16 ptrtoint (ptr addrspace(1) @X_as1 to i16))) to i16)
 ;
 
 entry:
diff --git a/llvm/test/Transforms/InstCombine/gepgep.ll b/llvm/test/Transforms/InstCombine/gepgep.ll
index 6a492a7173cd2e..d2a0e1db392ba0 100644
--- a/llvm/test/Transforms/InstCombine/gepgep.ll
+++ b/llvm/test/Transforms/InstCombine/gepgep.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -passes=instcombine -disable-output
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
@@ -8,6 +9,10 @@ target triple = "x86_64-unknown-linux-gnu"
 declare void @use(ptr)
 
 define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:    call void @use(ptr getelementptr (i8, ptr @buffer, i64 add (i64 sub (i64 0, i64 ptrtoint (ptr @buffer to i64)), i64 127)))
+; CHECK-NEXT:    ret void
+;
   call void @use(ptr getelementptr (i8, ptr getelementptr (i8, ptr @buffer, i64 add (i64 sub (i64 0, i64 ptrtoint (ptr @buffer to i64)), i64 63)), i64 64))
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll b/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll
index 3cea5aff7e704c..db2c8e2f22f6ee 100644
--- a/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll
+++ b/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll
@@ -94,9 +94,11 @@ entry:
 
 define i16 @constantexpr2() {
 ; CHECK-LABEL: @constantexpr2(
+; CHECK-NEXT:    [[I1:%.*]] = zext i1 icmp ne (ptr getelementptr inbounds ([6 x [1 x i64]], ptr @global_constant3, i64 0, i64 5, i64 0), ptr @global_constant4) to i16
 ; CHECK-NEXT:    [[I2:%.*]] = load ptr, ptr @global_constant5, align 1
 ; CHECK-NEXT:    [[I3:%.*]] = load i16, ptr [[I2]], align 1
-; CHECK-NEXT:    [[I5:%.*]] = xor i16 [[I3]], xor (i16 zext (i1 icmp ne (ptr getelementptr inbounds ([6 x [1 x i64]], ptr @global_constant3, i64 0, i64 5, i64 0), ptr @global_constant4) to i16), i16 -1)
+; CHECK-NEXT:    [[I4:%.*]] = xor i16 [[I3]], [[I1]]
+; CHECK-NEXT:    [[I5:%.*]] = xor i16 [[I4]], -1
 ; CHECK-NEXT:    ret i16 [[I5]]
 ;
   %i0 = icmp ne ptr getelementptr inbounds ([6 x [1 x i64]], ptr @global_constant3, i16 0, i16 5, i16 0), @global_constant4
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index a2d0c3eb39d69e..832db40fda5f67 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -5,6 +5,1707 @@ declare void @use(i32)
 
 ; PR1949
 
+; negative test for zext/zext additions with i16
+define i1 @cvt_icmp_0_zext_plus_zext_eq_i16(i16 %arg, i16 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq_i16(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = zext i16 [[ARG:%.*]] to i32
+; CHECK-NEXT:    [[I2:%.*]] = zext i16 [[ARG1:%.*]] to i32
+; CHECK-NEXT:    [[I3:%.*]] = sub nsw i32 0, [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = icmp eq i32 [[I2]], [[I3]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i16 %arg to i32
+  %i2 = zext i16 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp eq i32 %i3, 0
+  ret i1 %i4
+}
+
+; negative test for zext/zext addtions with i8
+define i1 @cvt_icmp_0_zext_plus_zext_eq_i8(i8 %arg, i8 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq_i8(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = zext i8 [[ARG:%.*]] to i32
+; CHECK-NEXT:    [[I2:%.*]] = zext i8 [[ARG1:%.*]] to i32
+; CHECK-NEXT:    [[I3:%.*]] = sub nsw i32 0, [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = icmp eq i32 [[I2]], [[I3]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i8 %arg to i32
+  %i2 = zext i8 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp eq i32 %i3, 0
+  ret i1 %i4
+}
+
+; start of positive tests
+define i1 @cvt_icmp_neg_2_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_zext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp eq i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_neg_1_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_zext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp eq i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_0_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp eq i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_1_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp eq i32 %i3, 1
+  ret i1 %i4
+}
+
+define <2 x i1> @cvt_icmp_1_zext_plus_zext_eq_vec(<2 x i1> %arg, <2 x i1> %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_eq_vec(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = xor <2 x i1> [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[I4]]
+;
+bb:
+  %i = zext <2 x i1> %arg to <2 x i32>
+  %i2 = zext <2 x i1> %arg1 to <2 x i32>
+  %i3 = add <2 x i32> %i2, %i
+  %i4 = icmp eq <2 x i32> %i3, <i32 1, i32 1>
+  ret <2 x i1> %i4
+}
+
+define i1 @cvt_icmp_2_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_zext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = and i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_2_sext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_sext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = and i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, -2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_1_sext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_sext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, -1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_0_sext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_sext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 0
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_1_sext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_sext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_2_sext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_sext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_2_sext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_sext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, -2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_1_sext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_sext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = and i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, -1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_0_sext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_sext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 0
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_1_sext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_sext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_2_sext_plus_zext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_sext_plus_zext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_2_zext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_zext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ne i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_neg_1_zext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_zext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ne i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_0_zext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ne i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_1_zext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ne i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_1_zext_plus_zext_ne_extra_use_1(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_ne_extra_use_1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = zext i1 [[ARG:%.*]] to i32
+; CHECK-NEXT:    [[I2:%.*]] = zext i1 [[ARG1:%.*]] to i32
+; CHECK-NEXT:    [[I3:%.*]] = add nuw nsw i32 [[I2]], [[I]]
+; CHECK-NEXT:    call void @use(i32 [[I3]])
+; CHECK-NEXT:    [[I4:%.*]] = icmp ne i32 [[I3]], 1
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  call void @use(i32 %i3)
+  %i4 = icmp ne i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_1_zext_plus_zext_ne_extra_use_2(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_ne_extra_use_2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = zext i1 [[ARG:%.*]] to i32
+; CHECK-NEXT:    call void @use(i32 [[I]])
+; CHECK-NEXT:    [[I2:%.*]] = zext i1 [[ARG1:%.*]] to i32
+; CHECK-NEXT:    call void @use(i32 [[I2]])
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1]], [[ARG]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  call void @use(i32 %i)
+  %i2 = zext i1 %arg1 to i32
+  call void @use(i32 %i2)
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ne i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @cvt_icmp_2_zext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_zext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_2_sext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_sext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, -2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_1_sext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_sext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, -1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_0_sext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_sext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = or i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 0
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_1_sext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_sext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_2_sext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_sext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_2_sext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_sext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, -2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_1_sext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_sext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, -1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_0_sext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_sext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 0
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_1_sext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_sext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = or i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_2_sext_plus_zext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_sext_plus_zext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 2
+  ret i1 %t
+}
+
+; test if zext i1 X + sext i1 Y converted to sext i1 X + zext i1 Y
+; and then processed
+
+define i1 @cvt_icmp_neg_2_zext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_zext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, -2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_1_zext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_zext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, -1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_0_zext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 0
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_1_zext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = and i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_2_zext_plus_sext_eq(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_zext_plus_sext_eq(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp eq i32 %i3, 2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_2_zext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_2_zext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, -2
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_neg_1_zext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_neg_1_zext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = or i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, -1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_0_zext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = xor i1 [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 0
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_1_zext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_1_zext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[T:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 1
+  ret i1 %t
+}
+
+define i1 @cvt_icmp_2_zext_plus_sext_ne(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @cvt_icmp_2_zext_plus_sext_ne(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i, %i2
+  %t = icmp ne i32 %i3, 2
+  ret i1 %t
+}
+
+; test zext/zext additions with more than one use
+
+define i1 @test_cvt_icmp1(i1 %arg, i1 %arg1, ptr %p) {
+; CHECK-LABEL: @test_cvt_icmp1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I2:%.*]] = zext i1 [[ARG:%.*]] to i32
+; CHECK-NEXT:    store i32 [[I2]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg to i32
+  store i32 %i2, ptr %p
+  %i3 = load i32, ptr %p
+  %i4 = add i32 %i3, %i
+  %t = icmp eq i32 %i4, 1
+  ret i1 %t
+}
+
+define i1 @test_cvt_icmp2(i1 %arg, i1 %arg1, ptr %p) {
+; CHECK-LABEL: @test_cvt_icmp2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I2:%.*]] = zext i1 [[ARG:%.*]] to i32
+; CHECK-NEXT:    store i32 [[I2]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg to i32
+  store i32 %i2, ptr %p
+  %i3 = load i32, ptr %p
+  %i4 = add i32 %i3, %i
+  %t = icmp eq i32 %i4, 1
+  ret i1 %t
+}
+
+; tests for ult
+define i1 @test_zext_zext_cvt_neg_2_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_2_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_neg_1_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_1_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_0_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_0_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_2_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_2_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_neg_2_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_neg_2_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_neg_1_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_neg_1_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_0_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_0_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_1_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_1_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_2_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_2_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_sext_zext_cvt_neg_2_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_zext_cvt_neg_2_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_sext_zext_cvt_neg_1_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_zext_cvt_neg_1_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_sext_zext_cvt_0_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_zext_cvt_0_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_sext_zext_cvt_2_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_zext_cvt_2_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_1_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_1_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_0_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_0_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_1_ult_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_1_ult_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 1
+  ret i1 %i4
+}
+
+; tests for ugt
+define i1 @test_cvt_icmp4(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp4(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_neg_2_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_2_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_1_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_1_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_2_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_2_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_neg_2_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_neg_2_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_0_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_0_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_2_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_2_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_2_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_2_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_1_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_1_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_0_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_0_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_1_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_1_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[TMP1:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_2_ugt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_2_ugt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp5(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp5(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp uge i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp6(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp6(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ule i32 %i3, 1
+  ret i1 %i4
+}
+
+; tests for sgt
+define i1 @test_cvt_icmp7(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp7(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_neg_2_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_2_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_neg_1_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_1_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_2_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_2_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_neg_2_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_neg_2_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_0_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_0_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_2_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_2_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_2_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_2_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_1_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_1_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ARG1_NOT:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1_NOT]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_0_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_0_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = and i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_1_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_1_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_2_sgt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_2_sgt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 2
+  ret i1 %i4
+}
+
+; tests for slt
+define i1 @test_zext_zext_cvt_neg_2_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_2_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_neg_1_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_neg_1_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_zext_cvt_2_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_zext_cvt_2_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_neg_2_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_neg_2_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_0_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_0_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[TMP0]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_sext_sext_cvt_2_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_sext_sext_cvt_2_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_2_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_2_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, -2
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_neg_1_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_neg_1_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, -1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_0_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_0_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[TMP1:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_1_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_1_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_zext_sext_cvt_2_slt_icmp(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_zext_sext_cvt_2_slt_icmp(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp8(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp8(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sge i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp9(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp9(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp10(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp10(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = zext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sle i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp11(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp11(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp12(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp12(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp uge i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp13(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp13(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp14(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp14(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[TMP0]], true
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ule i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp15(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp15(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp16(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp16(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sge i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp17(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp17(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp18(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp18(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = sext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sle i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp19(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp19(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = and i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ugt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp20(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp20(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I4:%.*]] = xor i1 [[ARG1:%.*]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp uge i32 %i3, 1
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp21(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp21(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ult i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp22(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp22(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp ule i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp23(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp23(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 false
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sgt i32 %i3, 2
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp24(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp24(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true
+; CHECK-NEXT:    [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]]
+; CHECK-NEXT:    ret i1 [[I4]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sge i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp25(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp25(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true
+; CHECK-NEXT:    [[TMP1:%.*]] = and i1 [[TMP0]], [[ARG:%.*]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp slt i32 %i3, 0
+  ret i1 %i4
+}
+
+define i1 @test_cvt_icmp26(i1 %arg, i1 %arg1) {
+; CHECK-LABEL: @test_cvt_icmp26(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    ret i1 true
+;
+bb:
+  %i = sext i1 %arg to i32
+  %i2 = zext i1 %arg1 to i32
+  %i3 = add i32 %i2, %i
+  %i4 = icmp sle i32 %i3, 1
+  ret i1 %i4
+}
+
 define i1 @test1(i32 %a) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[A:%.*]], -5
diff --git a/llvm/test/Transforms/InstCombine/isascii-i16.ll b/llvm/test/Transforms/InstCombine/isascii-i16.ll
index ef5d8c75d80c82..2cac57e09c37eb 100644
--- a/llvm/test/Transforms/InstCombine/isascii-i16.ll
+++ b/llvm/test/Transforms/InstCombine/isascii-i16.ll
@@ -2,8 +2,8 @@
 ; Test that the isascii library call simplifier works correctly even for
 ; targets with 16-bit int.
 ;
-; RUN: opt < %s -mtriple=avr-freebsd -passes=instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=msp430-linux -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=avr-freebsd -passes=instcombine -S | FileCheck %s --check-prefix=AVR
+; RUN: opt < %s -mtriple=msp430-linux -passes=instcombine -S | FileCheck %s --check-prefix=MSP430
 
 declare i16 @isascii(i16)
 
@@ -11,19 +11,33 @@ declare void @sink(i16)
 
 
 define void @fold_isascii(i16 %c) {
-; CHECK-LABEL: @fold_isascii(
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    [[ISASCII:%.*]] = icmp ult i16 [[C:%.*]], 128
-; CHECK-NEXT:    [[IC:%.*]] = zext i1 [[ISASCII]] to i16
-; CHECK-NEXT:    call void @sink(i16 [[IC]])
-; CHECK-NEXT:    ret void
+; AVR-LABEL: @fold_isascii(
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    [[ISASCII:%.*]] = icmp ult i16 [[C:%.*]], 128
+; AVR-NEXT:    [[IC:%.*]] = zext i1 [[ISASCII]] to i16
+; AVR-NEXT:    call void @sink(i16 [[IC]])
+; AVR-NEXT:    ret void
+;
+; MSP430-LABEL: @fold_isascii(
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    [[ISASCII:%.*]] = icmp ult i16 [[C:%.*]], 128
+; MSP430-NEXT:    [[IC:%.*]] = zext i1 [[ISASCII]] to i16
+; MSP430-NEXT:    call void @sink(i16 [[IC]])
+; MSP430-NEXT:    ret void
 ;
   %i0 = call i16 @isascii(i16 0)
   call void @sink(i16 %i0)
diff --git a/llvm/test/Transforms/InstCombine/isdigit-i16.ll b/llvm/test/Transforms/InstCombine/isdigit-i16.ll
index 7e63798ca438cb..69f7b7f2e1f702 100644
--- a/llvm/test/Transforms/InstCombine/isdigit-i16.ll
+++ b/llvm/test/Transforms/InstCombine/isdigit-i16.ll
@@ -2,32 +2,51 @@
 ; Test that the isdigit library call simplifier works correctly even for
 ; targets with 16-bit int.
 ;
-; RUN: opt < %s -mtriple=avr-linux -passes=instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=msp430-freebsd -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=avr-linux -passes=instcombine -S | FileCheck %s --check-prefix=AVR
+; RUN: opt < %s -mtriple=msp430-freebsd -passes=instcombine -S | FileCheck %s  --check-prefix=MSP430
 
 declare i16 @isdigit(i16)
 
 declare void @sink(i16)
 
 define void @fold_isdigit(i16 %c) {
-; CHECK-LABEL: @fold_isdigit(
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    call void @sink(i16 1)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    call void @sink(i16 0)
-; CHECK-NEXT:    [[ISDIGITTMP:%.*]] = add i16 [[C:%.*]], -48
-; CHECK-NEXT:    [[ISDIGIT:%.*]] = icmp ult i16 [[ISDIGITTMP]], 10
-; CHECK-NEXT:    [[IC:%.*]] = zext i1 [[ISDIGIT]] to i16
-; CHECK-NEXT:    call void @sink(i16 [[IC]])
-; CHECK-NEXT:    ret void
+; AVR-LABEL: @fold_isdigit(
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    call void @sink(i16 1)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    [[ISDIGITTMP:%.*]] = add i16 [[C:%.*]], -48
+; AVR-NEXT:    [[ISDIGIT:%.*]] = icmp ult i16 [[ISDIGITTMP]], 10
+; AVR-NEXT:    [[IC:%.*]] = zext i1 [[ISDIGIT]] to i16
+; AVR-NEXT:    call void @sink(i16 [[IC]])
+; AVR-NEXT:    ret void
+;
+; MSP430-LABEL: @fold_isdigit(
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    call void @sink(i16 1)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    call void @sink(i16 0)
+; MSP430-NEXT:    [[ISDIGITTMP:%.*]] = add i16 [[C:%.*]], -48
+; MSP430-NEXT:    [[ISDIGIT:%.*]] = icmp ult i16 [[ISDIGITTMP]], 10
+; MSP430-NEXT:    [[IC:%.*]] = zext i1 [[ISDIGIT]] to i16
+; MSP430-NEXT:    call void @sink(i16 [[IC]])
+; MSP430-NEXT:    ret void
 ;
   %i0 = call i16 @isdigit(i16 0)
   call void @sink(i16 %i0)
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index a5cab8a71d3f71..710c99033394bd 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -837,6 +837,60 @@ end:
   ret i1 %z
 }
 
+; Same as above, but the input is also a phi
+define i1 @test25b(i1 %ci, i64 %ai, i64 %bi) {
+; CHECK-LABEL: @test25b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CI:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[AI:%.*]], [[ENTRY:%.*]] ], [ [[BI:%.*]], [[THEN]] ]
+; CHECK-NEXT:    [[B:%.*]] = call i1 @test25a()
+; CHECK-NEXT:    br i1 [[B]], label [[ONE:%.*]], label [[TWO:%.*]]
+; CHECK:       one:
+; CHECK-NEXT:    [[C:%.*]] = call i1 @test25a()
+; CHECK-NEXT:    br i1 [[C]], label [[TWO]], label [[END:%.*]]
+; CHECK:       two:
+; CHECK-NEXT:    [[D:%.*]] = call i1 @test25a()
+; CHECK-NEXT:    br i1 [[D]], label [[ONE]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[G:%.*]] = inttoptr i64 [[I]] to ptr
+; CHECK-NEXT:    store i32 10, ptr [[G]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = call i1 @test25a()
+; CHECK-NEXT:    ret i1 [[Z]]
+;
+entry:
+  br i1 %ci, label %then, label %else
+
+then:
+  br label %else
+
+else:
+  %i = phi i64 [ %ai, %entry ], [ %bi, %then ]
+  %b = call i1 @test25a()
+  br i1 %b, label %one, label %two
+
+one:
+  %x = phi i64 [%y, %two], [%i, %else]
+  %c = call i1 @test25a()
+  br i1 %c, label %two, label %end
+
+two:
+  %y = phi i64 [%x, %one], [%i, %else]
+  %d = call i1 @test25a()
+  br i1 %d, label %one, label %end
+
+end:
+  %f = phi i64 [ %x, %one], [%y, %two]
+  ; Change the %f to %i, and the optimizer suddenly becomes a lot smarter
+  ; even though %f must equal %i at this point
+  %g = inttoptr i64 %f to ptr
+  store i32 10, ptr %g
+  %z = call i1 @test25a()
+  ret i1 %z
+}
+
 declare i1 @test26a()
 
 define i1 @test26(i32 %n) {
diff --git a/llvm/test/Transforms/InstCombine/pow-4.ll b/llvm/test/Transforms/InstCombine/pow-4.ll
index 25741eb18f06f9..dbe6477c0ecaaf 100644
--- a/llvm/test/Transforms/InstCombine/pow-4.ll
+++ b/llvm/test/Transforms/InstCombine/pow-4.ll
@@ -135,17 +135,17 @@ define double @test_simplify_33(double %x) {
 
 ; pow(x, 16.5) with double
 define double @test_simplify_16_5(double %x) {
-; CHECK32-LABEL: @test_simplify_16_5(
-; CHECK32-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
-; CHECK32-NEXT:    [[POWI:%.*]] = call fast double @llvm.powi.f64.i32(double [[X]], i32 16)
-; CHECK32-NEXT:    [[TMP1:%.*]] = fmul fast double [[POWI]], [[SQRT]]
-; CHECK32-NEXT:    ret double [[TMP1]]
-;
-; CHECK16-LABEL: @test_simplify_16_5(
-; CHECK16-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
-; CHECK16-NEXT:    [[POWI:%.*]] = call fast double @llvm.powi.f64.i16(double [[X]], i16 16)
-; CHECK16-NEXT:    [[TMP1:%.*]] = fmul fast double [[POWI]], [[SQRT]]
-; CHECK16-NEXT:    ret double [[TMP1]]
+; CHECKI32-LABEL: @test_simplify_16_5(
+; CHECKI32-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
+; CHECKI32-NEXT:    [[TMP1:%.*]] = call fast double @llvm.powi.f64.i32(double [[X]], i32 16)
+; CHECKI32-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[SQRT]]
+; CHECKI32-NEXT:    ret double [[TMP2]]
+;
+; CHECKI16-LABEL: @test_simplify_16_5(
+; CHECKI16-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
+; CHECKI16-NEXT:    [[TMP1:%.*]] = call fast double @llvm.powi.f64.i16(double [[X]], i16 16)
+; CHECKI16-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[SQRT]]
+; CHECKI16-NEXT:    ret double [[TMP2]]
 ;
   %1 = call fast double @llvm.pow.f64(double %x, double 1.650000e+01)
   ret double %1
@@ -153,17 +153,17 @@ define double @test_simplify_16_5(double %x) {
 
 ; pow(x, -16.5) with double
 define double @test_simplify_neg_16_5(double %x) {
-; CHECK32-LABEL: @test_simplify_neg_16_5(
-; CHECK32-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
-; CHECK32-NEXT:    [[POWI:%.*]] = call fast double @llvm.powi.f64.i32(double [[X]], i32 -17)
-; CHECK32-NEXT:    [[TMP1:%.*]] = fmul fast double [[POWI]], [[SQRT]]
-; CHECK32-NEXT:    ret double [[TMP1]]
-;
-; CHECK16-LABEL: @test_simplify_neg_16_5(
-; CHECK16-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
-; CHECK16-NEXT:    [[POWI:%.*]] = call fast double @llvm.powi.f64.i16(double [[X]], i16 -17)
-; CHECK16-NEXT:    [[TMP1:%.*]] = fmul fast double [[POWI]], [[SQRT]]
-; CHECK16-NEXT:    ret double [[TMP1]]
+; CHECKI32-LABEL: @test_simplify_neg_16_5(
+; CHECKI32-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
+; CHECKI32-NEXT:    [[TMP1:%.*]] = call fast double @llvm.powi.f64.i32(double [[X]], i32 -17)
+; CHECKI32-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[SQRT]]
+; CHECKI32-NEXT:    ret double [[TMP2]]
+;
+; CHECKI16-LABEL: @test_simplify_neg_16_5(
+; CHECKI16-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[X:%.*]])
+; CHECKI16-NEXT:    [[TMP1:%.*]] = call fast double @llvm.powi.f64.i16(double [[X]], i16 -17)
+; CHECKI16-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[SQRT]]
+; CHECKI16-NEXT:    ret double [[TMP2]]
 ;
   %1 = call fast double @llvm.pow.f64(double %x, double -1.650000e+01)
   ret double %1
@@ -202,15 +202,17 @@ define double @test_simplify_neg_0_5_libcall(double %x) {
 
 ; pow(x, -8.5) with float
 define float @test_simplify_neg_8_5(float %x) {
-; CHECK32-LABEL: @test_simplify_neg_8_5(
-; CHECK32-NEXT:    [[SQRT:%.*]] = call fast float @llvm.sqrt.f32(float [[X:%.*]])
-; CHECK32-NEXT:    [[POWI:%.*]] = call fast float @llvm.powi.f32.i32(float [[X]], i32 -9)
-; CHECK32-NEXT:    [[TMP1:%.*]] = fmul fast float [[POWI]], [[SQRT]]
-;
-; CHECK16-LABEL: @test_simplify_neg_8_5(
-; CHECK16-NEXT:    [[SQRT:%.*]] = call fast float @llvm.sqrt.f32(float [[X:%.*]])
-; CHECK16-NEXT:    [[POWI:%.*]] = call fast float @llvm.powi.f32.i16(float [[X]], i16 -9)
-; CHECK16-NEXT:    [[TMP1:%.*]] = fmul fast float [[POWI]], [[SQRT]]
+; CHECKI32-LABEL: @test_simplify_neg_8_5(
+; CHECKI32-NEXT:    [[SQRT:%.*]] = call fast float @llvm.sqrt.f32(float [[X:%.*]])
+; CHECKI32-NEXT:    [[TMP1:%.*]] = call fast float @llvm.powi.f32.i32(float [[X]], i32 -9)
+; CHECKI32-NEXT:    [[TMP2:%.*]] = fmul fast float [[TMP1]], [[SQRT]]
+; CHECKI32-NEXT:    ret float [[TMP2]]
+;
+; CHECKI16-LABEL: @test_simplify_neg_8_5(
+; CHECKI16-NEXT:    [[SQRT:%.*]] = call fast float @llvm.sqrt.f32(float [[X:%.*]])
+; CHECKI16-NEXT:    [[TMP1:%.*]] = call fast float @llvm.powi.f32.i16(float [[X]], i16 -9)
+; CHECKI16-NEXT:    [[TMP2:%.*]] = fmul fast float [[TMP1]], [[SQRT]]
+; CHECKI16-NEXT:    ret float [[TMP2]]
 ;
   %1 = call fast float @llvm.pow.f32(float %x, float -0.850000e+01)
   ret float %1
@@ -218,15 +220,17 @@ define float @test_simplify_neg_8_5(float %x) {
 
 ; pow(x, 7.5) with <2 x double>
 define <2 x double> @test_simplify_7_5(<2 x double> %x) {
-; CHECK32-LABEL: @test_simplify_7_5(
-; CHECK32-NEXT:    [[SQRT:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X:%.*]])
-; CHECK32-NEXT:    [[POWI:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X]], i32 7)
-; CHECK32-NEXT:    [[TMP1:%.*]] = fmul fast <2 x double> [[POWI]], [[SQRT]]
-;
-; CHECK16-LABEL: @test_simplify_7_5(
-; CHECK16-NEXT:    [[SQRT:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X:%.*]])
-; CHECK16-NEXT:    [[POWI:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i16(<2 x double> [[X]], i16 7)
-; CHECK16-NEXT:    [[TMP1:%.*]] = fmul fast <2 x double> [[POWI]], [[SQRT]]
+; CHECKI32-LABEL: @test_simplify_7_5(
+; CHECKI32-NEXT:    [[SQRT:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X:%.*]])
+; CHECKI32-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[X]], i32 7)
+; CHECKI32-NEXT:    [[TMP2:%.*]] = fmul fast <2 x double> [[TMP1]], [[SQRT]]
+; CHECKI32-NEXT:    ret <2 x double> [[TMP2]]
+;
+; CHECKI16-LABEL: @test_simplify_7_5(
+; CHECKI16-NEXT:    [[SQRT:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[X:%.*]])
+; CHECKI16-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.powi.v2f64.i16(<2 x double> [[X]], i16 7)
+; CHECKI16-NEXT:    [[TMP2:%.*]] = fmul fast <2 x double> [[TMP1]], [[SQRT]]
+; CHECKI16-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %1 = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %x, <2 x double> <double 7.500000e+00, double 7.500000e+00>)
   ret <2 x double> %1
@@ -234,15 +238,17 @@ define <2 x double> @test_simplify_7_5(<2 x double> %x) {
 
 ; pow(x, 3.5) with <4 x float>
 define <4 x float> @test_simplify_3_5(<4 x float> %x) {
-; CHECK32-LABEL: @test_simplify_3_5(
-; CHECK32-NEXT:    [[SQRT:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[X:%.*]])
-; CHECK32-NEXT:    [[POWI:%.*]] = call fast <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X]], i32 3)
-; CHECK32-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[POWI]], [[SQRT]]
-;
-; CHECK16-LABEL: @test_simplify_3_5(
-; CHECK16-NEXT:    [[SQRT:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[X:%.*]])
-; CHECK16-NEXT:    [[POWI:%.*]] = call fast <4 x float> @llvm.powi.v4f32.i16(<4 x float> [[X]], i16 3)
-; CHECK16-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[POWI]], [[SQRT]]
+; CHECKI32-LABEL: @test_simplify_3_5(
+; CHECKI32-NEXT:    [[SQRT:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[X:%.*]])
+; CHECKI32-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X]], i32 3)
+; CHECKI32-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[SQRT]]
+; CHECKI32-NEXT:    ret <4 x float> [[TMP2]]
+;
+; CHECKI16-LABEL: @test_simplify_3_5(
+; CHECKI16-NEXT:    [[SQRT:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[X:%.*]])
+; CHECKI16-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.powi.v4f32.i16(<4 x float> [[X]], i16 3)
+; CHECKI16-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[SQRT]]
+; CHECKI16-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %1 = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %x, <4 x float> <float 3.500000e+00, float 3.500000e+00, float 3.500000e+00, float 3.500000e+00>)
   ret <4 x float> %1
diff --git a/llvm/test/Transforms/InstCombine/pow_fp_int.ll b/llvm/test/Transforms/InstCombine/pow_fp_int.ll
index 8718df3e86638a..e4d74f6bacb06a 100644
--- a/llvm/test/Transforms/InstCombine/pow_fp_int.ll
+++ b/llvm/test/Transforms/InstCombine/pow_fp_int.ll
@@ -1,12 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -mtriple unknown -passes=instcombine -S < %s | FileCheck %s
 
 ; PR42190
 ; Can't generate test checks due to PR42740.
 
 define double @pow_sitofp_const_base_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[X:%.*]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP1]] to double
+; CHECK-LABEL: define double @pow_sitofp_const_base_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[X]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = sitofp i32 %x to float
@@ -16,10 +18,11 @@ define double @pow_sitofp_const_base_fast(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_fast(i31 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP2]] to double
+; CHECK-LABEL: define double @pow_uitofp_const_base_fast(
+; CHECK-SAME: i31 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = uitofp i31 %x to float
@@ -29,9 +32,10 @@ define double @pow_uitofp_const_base_fast(i31 %x) {
 }
 
 define double @pow_sitofp_double_const_base_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_double_const_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call afn double @llvm.powi.f64.i32(double 7.000000e+00, i32 [[X:%.*]])
-; CHECK-NEXT:    ret double [[TMP1]]
+; CHECK-LABEL: define double @pow_sitofp_double_const_base_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn double @llvm.powi.f64.i32(double 7.000000e+00, i32 [[X]])
+; CHECK-NEXT:    ret double [[POW]]
 ;
   %subfp = sitofp i32 %x to double
   %pow = tail call afn double @llvm.pow.f64(double 7.000000e+00, double %subfp)
@@ -39,10 +43,11 @@ define double @pow_sitofp_double_const_base_fast(i32 %x) {
 }
 
 define double @pow_uitofp_double_const_base_fast(i31 %x) {
-; CHECK-LABEL: @pow_uitofp_double_const_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn double @llvm.powi.f64.i32(double 7.000000e+00, i32 [[TMP1]])
-; CHECK-NEXT:    ret double [[TMP2]]
+; CHECK-LABEL: define double @pow_uitofp_double_const_base_fast(
+; CHECK-SAME: i31 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn double @llvm.powi.f64.i32(double 7.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT:    ret double [[POW]]
 ;
   %subfp = uitofp i31 %x to double
   %pow = tail call afn double @llvm.pow.f64(double 7.000000e+00, double %subfp)
@@ -50,8 +55,9 @@ define double @pow_uitofp_double_const_base_fast(i31 %x) {
 }
 
 define double @pow_sitofp_double_const_base_2_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_double_const_base_2_fast(
-; CHECK-NEXT:    [[LDEXPF:%.*]] = tail call afn float @ldexpf(float 1.000000e+00, i32 [[X:%.*]])
+; CHECK-LABEL: define double @pow_sitofp_double_const_base_2_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[LDEXPF:%.*]] = tail call afn float @ldexpf(float 1.000000e+00, i32 [[X]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[LDEXPF]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
@@ -62,8 +68,9 @@ define double @pow_sitofp_double_const_base_2_fast(i32 %x) {
 }
 
 define double @pow_sitofp_double_const_base_power_of_2_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_double_const_base_power_of_2_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_sitofp_double_const_base_power_of_2_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[SUBFP]], 4.000000e+00
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call afn float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
@@ -76,8 +83,9 @@ define double @pow_sitofp_double_const_base_power_of_2_fast(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_2_fast(i31 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_2_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X:%.*]] to i32
+; CHECK-LABEL: define double @pow_uitofp_const_base_2_fast(
+; CHECK-SAME: i31 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X]] to i32
 ; CHECK-NEXT:    [[LDEXPF:%.*]] = tail call afn float @ldexpf(float 1.000000e+00, i32 [[TMP1]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[LDEXPF]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -89,8 +97,9 @@ define double @pow_uitofp_const_base_2_fast(i31 %x) {
 }
 
 define double @pow_uitofp_const_base_power_of_2_fast(i31 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_power_of_2_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i31 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_uitofp_const_base_power_of_2_fast(
+; CHECK-SAME: i31 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i31 [[X]] to float
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul afn float [[SUBFP]], 4.000000e+00
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call afn float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
@@ -103,9 +112,10 @@ define double @pow_uitofp_const_base_power_of_2_fast(i31 %x) {
 }
 
 define double @pow_sitofp_float_base_fast(float %base, i32 %x) {
-; CHECK-LABEL: @pow_sitofp_float_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call afn float @llvm.powi.f32.i32(float [[BASE:%.*]], i32 [[X:%.*]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP1]] to double
+; CHECK-LABEL: define double @pow_sitofp_float_base_fast(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float [[BASE]], i32 [[X]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = sitofp i32 %x to float
@@ -115,10 +125,11 @@ define double @pow_sitofp_float_base_fast(float %base, i32 %x) {
 }
 
 define double @pow_uitofp_float_base_fast(float %base, i31 %x) {
-; CHECK-LABEL: @pow_uitofp_float_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn float @llvm.powi.f32.i32(float [[BASE:%.*]], i32 [[TMP1]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP2]] to double
+; CHECK-LABEL: define double @pow_uitofp_float_base_fast(
+; CHECK-SAME: float [[BASE:%.*]], i31 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float [[BASE]], i32 [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = uitofp i31 %x to float
@@ -128,9 +139,10 @@ define double @pow_uitofp_float_base_fast(float %base, i31 %x) {
 }
 
 define double @pow_sitofp_double_base_fast(double %base, i32 %x) {
-; CHECK-LABEL: @pow_sitofp_double_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call afn double @llvm.powi.f64.i32(double [[BASE:%.*]], i32 [[X:%.*]])
-; CHECK-NEXT:    ret double [[TMP1]]
+; CHECK-LABEL: define double @pow_sitofp_double_base_fast(
+; CHECK-SAME: double [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = tail call afn double @llvm.powi.f64.i32(double [[BASE]], i32 [[X]])
+; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = sitofp i32 %x to double
   %res = tail call afn double @llvm.pow.f64(double %base, double %subfp)
@@ -138,10 +150,11 @@ define double @pow_sitofp_double_base_fast(double %base, i32 %x) {
 }
 
 define double @pow_uitofp_double_base_fast(double %base, i31 %x) {
-; CHECK-LABEL: @pow_uitofp_double_base_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn double @llvm.powi.f64.i32(double [[BASE:%.*]], i32 [[TMP1]])
-; CHECK-NEXT:    ret double [[TMP2]]
+; CHECK-LABEL: define double @pow_uitofp_double_base_fast(
+; CHECK-SAME: double [[BASE:%.*]], i31 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i31 [[X]] to i32
+; CHECK-NEXT:    [[RES:%.*]] = tail call afn double @llvm.powi.f64.i32(double [[BASE]], i32 [[TMP1]])
+; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = uitofp i31 %x to double
   %res = tail call afn double @llvm.pow.f64(double %base, double %subfp)
@@ -149,10 +162,11 @@ define double @pow_uitofp_double_base_fast(double %base, i31 %x) {
 }
 
 define double @pow_sitofp_const_base_fast_i8(i8 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_fast_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP2]] to double
+; CHECK-LABEL: define double @pow_sitofp_const_base_fast_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = sitofp i8 %x to float
@@ -162,10 +176,11 @@ define double @pow_sitofp_const_base_fast_i8(i8 %x) {
 }
 
 define double @pow_sitofp_const_base_fast_i16(i16 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_fast_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP2]] to double
+; CHECK-LABEL: define double @pow_sitofp_const_base_fast_i16(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = sitofp i16 %x to float
@@ -176,10 +191,11 @@ define double @pow_sitofp_const_base_fast_i16(i16 %x) {
 
 
 define double @pow_uitofp_const_base_fast_i8(i8 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_fast_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP2]] to double
+; CHECK-LABEL: define double @pow_uitofp_const_base_fast_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = uitofp i8 %x to float
@@ -189,10 +205,11 @@ define double @pow_uitofp_const_base_fast_i8(i8 %x) {
 }
 
 define double @pow_uitofp_const_base_fast_i16(i16 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_fast_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
-; CHECK-NEXT:    [[RES:%.*]] = fpext float [[TMP2]] to double
+; CHECK-LABEL: define double @pow_uitofp_const_base_fast_i16(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X]] to i32
+; CHECK-NEXT:    [[POW:%.*]] = tail call afn float @llvm.powi.f32.i32(float 7.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = uitofp i16 %x to float
@@ -202,18 +219,20 @@ define double @pow_uitofp_const_base_fast_i16(i16 %x) {
 }
 
 define double @powf_exp_const_int_fast(double %base) {
-; CHECK-LABEL: @powf_exp_const_int_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[BASE:%.*]], i32 40)
-; CHECK-NEXT:    ret double [[TMP1]]
+; CHECK-LABEL: define double @powf_exp_const_int_fast(
+; CHECK-SAME: double [[BASE:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[BASE]], i32 40)
+; CHECK-NEXT:    ret double [[RES]]
 ;
   %res = tail call fast double @llvm.pow.f64(double %base, double 4.000000e+01)
   ret double %res
 }
 
 define double @powf_exp_const2_int_fast(double %base) {
-; CHECK-LABEL: @powf_exp_const2_int_fast(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[BASE:%.*]], i32 -40)
-; CHECK-NEXT:    ret double [[TMP1]]
+; CHECK-LABEL: define double @powf_exp_const2_int_fast(
+; CHECK-SAME: double [[BASE:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[BASE]], i32 -40)
+; CHECK-NEXT:    ret double [[RES]]
 ;
   %res = tail call fast double @llvm.pow.f64(double %base, double -4.000000e+01)
   ret double %res
@@ -222,9 +241,10 @@ define double @powf_exp_const2_int_fast(double %base) {
 ; Negative tests
 
 define double @pow_uitofp_const_base_fast_i32(i32 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_fast_i32(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 0x4006757{{.*}}
+; CHECK-LABEL: define double @pow_uitofp_const_base_fast_i32(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 0x4006757680000000
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -236,8 +256,9 @@ define double @pow_uitofp_const_base_fast_i32(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_2_fast_i32(i32 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_2_fast_i32(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_uitofp_const_base_2_fast_i32(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -249,8 +270,9 @@ define double @pow_uitofp_const_base_2_fast_i32(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_power_of_2_fast_i32(i32 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_power_of_2_fast_i32(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_uitofp_const_base_power_of_2_fast_i32(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 4.000000e+00
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
@@ -263,9 +285,10 @@ define double @pow_uitofp_const_base_power_of_2_fast_i32(i32 %x) {
 }
 
 define double @pow_uitofp_float_base_fast_i32(float %base, i32 %x) {
-; CHECK-LABEL: @pow_uitofp_float_base_fast_i32(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[POW:%.*]] = tail call fast float @llvm.pow.f32(float [[BASE:%.*]], float [[SUBFP]])
+; CHECK-LABEL: define double @pow_uitofp_float_base_fast_i32(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
+; CHECK-NEXT:    [[POW:%.*]] = tail call fast float @llvm.pow.f32(float [[BASE]], float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
@@ -276,9 +299,10 @@ define double @pow_uitofp_float_base_fast_i32(float %base, i32 %x) {
 }
 
 define double @pow_uitofp_double_base_fast_i32(double %base, i32 %x) {
-; CHECK-LABEL: @pow_uitofp_double_base_fast_i32(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to double
-; CHECK-NEXT:    [[RES:%.*]] = tail call fast double @llvm.pow.f64(double [[BASE:%.*]], double [[SUBFP]])
+; CHECK-LABEL: define double @pow_uitofp_double_base_fast_i32(
+; CHECK-SAME: double [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to double
+; CHECK-NEXT:    [[RES:%.*]] = tail call fast double @llvm.pow.f64(double [[BASE]], double [[SUBFP]])
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %subfp = uitofp i32 %x to double
@@ -287,14 +311,15 @@ define double @pow_uitofp_double_base_fast_i32(double %base, i32 %x) {
 }
 
 define double @pow_sitofp_const_base_fast_i64(i64 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_fast_i64(
-; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i64 [[X:%.*]] to float
-; Do not change 0x400675{{.*}} to the exact constant, see PR42740
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 0x400675{{.*}}
+; CHECK-LABEL: define double @pow_sitofp_const_base_fast_i64(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i64 [[X]] to float
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 0x4006757680000000
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
+; Do not change 0x400675{{.*}} to the exact constant, see PR42740
   %subfp = sitofp i64 %x to float
   %pow = tail call fast float @llvm.pow.f32(float 7.000000e+00, float %subfp)
   %res = fpext float %pow to double
@@ -302,9 +327,10 @@ define double @pow_sitofp_const_base_fast_i64(i64 %x) {
 }
 
 define double @pow_uitofp_const_base_fast_i64(i64 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_fast_i64(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i64 [[X:%.*]] to float
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 0x400675{{.*}}
+; CHECK-LABEL: define double @pow_uitofp_const_base_fast_i64(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i64 [[X]] to float
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[SUBFP]], 0x4006757680000000
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call fast float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -316,8 +342,9 @@ define double @pow_uitofp_const_base_fast_i64(i64 %x) {
 }
 
 define double @pow_sitofp_const_base_no_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_sitofp_const_base_no_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float 7.000000e+00, float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -329,8 +356,9 @@ define double @pow_sitofp_const_base_no_fast(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_no_fast(i32 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_uitofp_const_base_no_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float 7.000000e+00, float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -342,8 +370,9 @@ define double @pow_uitofp_const_base_no_fast(i32 %x) {
 }
 
 define double @pow_sitofp_const_base_2_no_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_2_no_fast(
-; CHECK-NEXT:    [[LDEXPF:%.*]] = tail call float @ldexpf(float 1.000000e+00, i32 [[X:%.*]])
+; CHECK-LABEL: define double @pow_sitofp_const_base_2_no_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[LDEXPF:%.*]] = tail call float @ldexpf(float 1.000000e+00, i32 [[X]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[LDEXPF]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
@@ -354,8 +383,9 @@ define double @pow_sitofp_const_base_2_no_fast(i32 %x) {
 }
 
 define double @pow_sitofp_const_base_power_of_2_no_fast(i32 %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_power_of_2_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_sitofp_const_base_power_of_2_no_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SUBFP]], 4.000000e+00
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
@@ -368,8 +398,9 @@ define double @pow_sitofp_const_base_power_of_2_no_fast(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_2_no_fast(i32 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_2_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_uitofp_const_base_2_no_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @llvm.exp2.f32(float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
 ; CHECK-NEXT:    ret double [[RES]]
@@ -381,8 +412,9 @@ define double @pow_uitofp_const_base_2_no_fast(i32 %x) {
 }
 
 define double @pow_uitofp_const_base_power_of_2_no_fast(i32 %x) {
-; CHECK-LABEL: @pow_uitofp_const_base_power_of_2_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
+; CHECK-LABEL: define double @pow_uitofp_const_base_power_of_2_no_fast(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[SUBFP]], 4.000000e+00
 ; CHECK-NEXT:    [[EXP2:%.*]] = tail call float @llvm.exp2.f32(float [[MUL]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[EXP2]] to double
@@ -395,9 +427,10 @@ define double @pow_uitofp_const_base_power_of_2_no_fast(i32 %x) {
 }
 
 define double @pow_sitofp_float_base_no_fast(float %base, i32 %x) {
-; CHECK-LABEL: @pow_sitofp_float_base_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE:%.*]], float [[SUBFP]])
+; CHECK-LABEL: define double @pow_sitofp_float_base_no_fast(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X]] to float
+; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE]], float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
@@ -408,9 +441,10 @@ define double @pow_sitofp_float_base_no_fast(float %base, i32 %x) {
 }
 
 define double @pow_uitofp_float_base_no_fast(float %base, i32 %x) {
-; CHECK-LABEL: @pow_uitofp_float_base_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to float
-; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE:%.*]], float [[SUBFP]])
+; CHECK-LABEL: define double @pow_uitofp_float_base_no_fast(
+; CHECK-SAME: float [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to float
+; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE]], float [[SUBFP]])
 ; CHECK-NEXT:    [[RES:%.*]] = fpext float [[POW]] to double
 ; CHECK-NEXT:    ret double [[RES]]
 ;
@@ -421,9 +455,10 @@ define double @pow_uitofp_float_base_no_fast(float %base, i32 %x) {
 }
 
 define double @pow_sitofp_double_base_no_fast(double %base, i32 %x) {
-; CHECK-LABEL: @pow_sitofp_double_base_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X:%.*]] to double
-; CHECK-NEXT:    [[POW:%.*]] = tail call double @llvm.pow.f64(double [[BASE:%.*]], double [[SUBFP]])
+; CHECK-LABEL: define double @pow_sitofp_double_base_no_fast(
+; CHECK-SAME: double [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = sitofp i32 [[X]] to double
+; CHECK-NEXT:    [[POW:%.*]] = tail call double @llvm.pow.f64(double [[BASE]], double [[SUBFP]])
 ; CHECK-NEXT:    ret double [[POW]]
 ;
   %subfp = sitofp i32 %x to double
@@ -432,9 +467,10 @@ define double @pow_sitofp_double_base_no_fast(double %base, i32 %x) {
 }
 
 define double @pow_uitofp_double_base_no_fast(double %base, i32 %x) {
-; CHECK-LABEL: @pow_uitofp_double_base_no_fast(
-; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X:%.*]] to double
-; CHECK-NEXT:    [[POW:%.*]] = tail call double @llvm.pow.f64(double [[BASE:%.*]], double [[SUBFP]])
+; CHECK-LABEL: define double @pow_uitofp_double_base_no_fast(
+; CHECK-SAME: double [[BASE:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SUBFP:%.*]] = uitofp i32 [[X]] to double
+; CHECK-NEXT:    [[POW:%.*]] = tail call double @llvm.pow.f64(double [[BASE]], double [[SUBFP]])
 ; CHECK-NEXT:    ret double [[POW]]
 ;
   %subfp = uitofp i32 %x to double
@@ -445,8 +481,9 @@ define double @pow_uitofp_double_base_no_fast(double %base, i32 %x) {
 ; negative test - pow with no FMF is not the same as the loosely-specified powi
 
 define double @powf_exp_const_int_no_fast(double %base) {
-; CHECK-LABEL: @powf_exp_const_int_no_fast(
-; CHECK-NEXT:    [[RES:%.*]] = tail call double @llvm.pow.f64(double [[BASE:%.*]], double 4.000000e+01)
+; CHECK-LABEL: define double @powf_exp_const_int_no_fast(
+; CHECK-SAME: double [[BASE:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = tail call double @llvm.pow.f64(double [[BASE]], double 4.000000e+01)
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %res = tail call double @llvm.pow.f64(double %base, double 4.000000e+01)
@@ -454,10 +491,11 @@ define double @powf_exp_const_int_no_fast(double %base) {
 }
 
 define double @powf_exp_const_not_int_fast(double %base) {
-; CHECK-LABEL: @powf_exp_const_not_int_fast(
-; CHECK-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[BASE:%.*]])
-; CHECK-NEXT:    [[POWI:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[BASE]], i32 37)
-; CHECK-NEXT:    [[RES:%.*]] = fmul fast double [[POWI]], [[SQRT]]
+; CHECK-LABEL: define double @powf_exp_const_not_int_fast(
+; CHECK-SAME: double [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = call fast double @llvm.sqrt.f64(double [[BASE]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.powi.f64.i32(double [[BASE]], i32 37)
+; CHECK-NEXT:    [[RES:%.*]] = fmul fast double [[TMP1]], [[SQRT]]
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %res = tail call fast double @llvm.pow.f64(double %base, double 3.750000e+01)
@@ -465,8 +503,9 @@ define double @powf_exp_const_not_int_fast(double %base) {
 }
 
 define double @powf_exp_const_not_int_no_fast(double %base) {
-; CHECK-LABEL: @powf_exp_const_not_int_no_fast(
-; CHECK-NEXT:    [[RES:%.*]] = tail call double @llvm.pow.f64(double [[BASE:%.*]], double 3.750000e+01)
+; CHECK-LABEL: define double @powf_exp_const_not_int_no_fast(
+; CHECK-SAME: double [[BASE:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = tail call double @llvm.pow.f64(double [[BASE]], double 3.750000e+01)
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %res = tail call double @llvm.pow.f64(double %base, double 3.750000e+01)
@@ -476,8 +515,9 @@ define double @powf_exp_const_not_int_no_fast(double %base) {
 ; negative test - pow with no FMF is not the same as the loosely-specified powi
 
 define double @powf_exp_const2_int_no_fast(double %base) {
-; CHECK-LABEL: @powf_exp_const2_int_no_fast(
-; CHECK-NEXT:    [[RES:%.*]] = tail call double @llvm.pow.f64(double [[BASE:%.*]], double -4.000000e+01)
+; CHECK-LABEL: define double @powf_exp_const2_int_no_fast(
+; CHECK-SAME: double [[BASE:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = tail call double @llvm.pow.f64(double [[BASE]], double -4.000000e+01)
 ; CHECK-NEXT:    ret double [[RES]]
 ;
   %res = tail call double @llvm.pow.f64(double %base, double -4.000000e+01)
@@ -487,8 +527,9 @@ define double @powf_exp_const2_int_no_fast(double %base) {
 ; TODO: This could be transformed the same as scalar if there is an ldexp intrinsic.
 
 define <2 x float> @pow_sitofp_const_base_2_no_fast_vector(<2 x i8> %x) {
-; CHECK-LABEL: @pow_sitofp_const_base_2_no_fast_vector(
-; CHECK-NEXT:    [[S:%.*]] = sitofp <2 x i8> [[X:%.*]] to <2 x float>
+; CHECK-LABEL: define <2 x float> @pow_sitofp_const_base_2_no_fast_vector(
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
+; CHECK-NEXT:    [[S:%.*]] = sitofp <2 x i8> [[X]] to <2 x float>
 ; CHECK-NEXT:    [[EXP2:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[S]])
 ; CHECK-NEXT:    ret <2 x float> [[EXP2]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/printf-i16.ll b/llvm/test/Transforms/InstCombine/printf-i16.ll
index c5868b5d5c4cd2..f9cc4ebfbcc342 100644
--- a/llvm/test/Transforms/InstCombine/printf-i16.ll
+++ b/llvm/test/Transforms/InstCombine/printf-i16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;
-; RUN: opt < %s -mtriple=avr-freebsd -passes=instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=msp430-linux -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=avr-freebsd -passes=instcombine -S | FileCheck %s --check-prefix=AVR
+; RUN: opt < %s -mtriple=msp430-linux -passes=instcombine -S | FileCheck %s --check-prefix=MSP430
 ;
 ; Verify that the puts to putchar transformation works correctly even for
 ; targets with 16-bit int.
@@ -23,25 +23,42 @@ declare i16 @printf(ptr, ...)
 ; in the same output for calls with equivalent arguments.
 
 define void @xform_printf(i8 %c8, i16 %c16) {
-; CHECK-LABEL: @xform_printf(
-; CHECK-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 1)
-; CHECK-NEXT:    [[PUTCHAR1:%.*]] = call i16 @putchar(i16 1)
-; CHECK-NEXT:    [[PUTCHAR2:%.*]] = call i16 @putchar(i16 1)
-; CHECK-NEXT:    [[PUTCHAR3:%.*]] = call i16 @putchar(i16 127)
-; CHECK-NEXT:    [[PUTCHAR4:%.*]] = call i16 @putchar(i16 127)
-; CHECK-NEXT:    [[PUTCHAR5:%.*]] = call i16 @putchar(i16 127)
-; CHECK-NEXT:    [[PUTCHAR6:%.*]] = call i16 @putchar(i16 128)
-; CHECK-NEXT:    [[PUTCHAR7:%.*]] = call i16 @putchar(i16 128)
-; CHECK-NEXT:    [[PUTCHAR8:%.*]] = call i16 @putchar(i16 128)
-; CHECK-NEXT:    [[PUTCHAR9:%.*]] = call i16 @putchar(i16 255)
-; CHECK-NEXT:    [[PUTCHAR10:%.*]] = call i16 @putchar(i16 255)
-; CHECK-NEXT:    [[PUTCHAR11:%.*]] = call i16 @putchar(i16 255)
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[C8:%.*]] to i16
-; CHECK-NEXT:    [[PUTCHAR12:%.*]] = call i16 @putchar(i16 [[TMP1]])
-; CHECK-NEXT:    [[PUTCHAR13:%.*]] = call i16 @putchar(i16 [[C16:%.*]])
-; CHECK-NEXT:    ret void
+; AVR-LABEL: @xform_printf(
+; AVR-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 1)
+; AVR-NEXT:    [[PUTCHAR1:%.*]] = call i16 @putchar(i16 1)
+; AVR-NEXT:    [[PUTCHAR2:%.*]] = call i16 @putchar(i16 1)
+; AVR-NEXT:    [[PUTCHAR3:%.*]] = call i16 @putchar(i16 127)
+; AVR-NEXT:    [[PUTCHAR4:%.*]] = call i16 @putchar(i16 127)
+; AVR-NEXT:    [[PUTCHAR5:%.*]] = call i16 @putchar(i16 127)
+; AVR-NEXT:    [[PUTCHAR6:%.*]] = call i16 @putchar(i16 128)
+; AVR-NEXT:    [[PUTCHAR7:%.*]] = call i16 @putchar(i16 128)
+; AVR-NEXT:    [[PUTCHAR8:%.*]] = call i16 @putchar(i16 128)
+; AVR-NEXT:    [[PUTCHAR9:%.*]] = call i16 @putchar(i16 255)
+; AVR-NEXT:    [[PUTCHAR10:%.*]] = call i16 @putchar(i16 255)
+; AVR-NEXT:    [[PUTCHAR11:%.*]] = call i16 @putchar(i16 255)
+; AVR-NEXT:    [[TMP1:%.*]] = zext i8 [[C8:%.*]] to i16
+; AVR-NEXT:    [[PUTCHAR12:%.*]] = call i16 @putchar(i16 [[TMP1]])
+; AVR-NEXT:    [[PUTCHAR13:%.*]] = call i16 @putchar(i16 [[C16:%.*]])
+; AVR-NEXT:    ret void
+;
+; MSP430-LABEL: @xform_printf(
+; MSP430-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 1)
+; MSP430-NEXT:    [[PUTCHAR1:%.*]] = call i16 @putchar(i16 1)
+; MSP430-NEXT:    [[PUTCHAR2:%.*]] = call i16 @putchar(i16 1)
+; MSP430-NEXT:    [[PUTCHAR3:%.*]] = call i16 @putchar(i16 127)
+; MSP430-NEXT:    [[PUTCHAR4:%.*]] = call i16 @putchar(i16 127)
+; MSP430-NEXT:    [[PUTCHAR5:%.*]] = call i16 @putchar(i16 127)
+; MSP430-NEXT:    [[PUTCHAR6:%.*]] = call i16 @putchar(i16 128)
+; MSP430-NEXT:    [[PUTCHAR7:%.*]] = call i16 @putchar(i16 128)
+; MSP430-NEXT:    [[PUTCHAR8:%.*]] = call i16 @putchar(i16 128)
+; MSP430-NEXT:    [[PUTCHAR9:%.*]] = call i16 @putchar(i16 255)
+; MSP430-NEXT:    [[PUTCHAR10:%.*]] = call i16 @putchar(i16 255)
+; MSP430-NEXT:    [[PUTCHAR11:%.*]] = call i16 @putchar(i16 255)
+; MSP430-NEXT:    [[TMP1:%.*]] = zext i8 [[C8:%.*]] to i16
+; MSP430-NEXT:    [[PUTCHAR12:%.*]] = call i16 @putchar(i16 [[TMP1]])
+; MSP430-NEXT:    [[PUTCHAR13:%.*]] = call i16 @putchar(i16 [[C16:%.*]])
+; MSP430-NEXT:    ret void
 ;
-
   call i16 (ptr, ...) @printf(ptr @s1)
   call i16 (ptr, ...) @printf(ptr @pcnt_c, i16 1)
   call i16 (ptr, ...) @printf(ptr @pcnt_s, ptr @s1)
diff --git a/llvm/test/Transforms/InstCombine/puts-i16.ll b/llvm/test/Transforms/InstCombine/puts-i16.ll
index 58d12ab9cd2ab7..92769083a8affe 100644
--- a/llvm/test/Transforms/InstCombine/puts-i16.ll
+++ b/llvm/test/Transforms/InstCombine/puts-i16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;
-; RUN: opt < %s -mtriple=avr-linux -passes=instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=msp430-freebsd -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=avr-linux -passes=instcombine -S | FileCheck %s --check-prefix=AVR
+; RUN: opt < %s -mtriple=msp430-freebsd -passes=instcombine -S | FileCheck %s --check-prefix=MSP430
 ;
 ; Test that the puts to putchar transformation works correctly even for
 ; targets with 16-bit int.
@@ -12,11 +12,15 @@ declare i16 @puts(ptr)
 @empty = constant [1 x i8] c"\00"
 
 define void @xform_puts(i16 %c) {
-; CHECK-LABEL: @xform_puts(
-; CHECK-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 10)
-; CHECK-NEXT:    ret void
-;
 ; Transform puts("") to putchar("\n").
+; AVR-LABEL: @xform_puts(
+; AVR-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 10)
+; AVR-NEXT:    ret void
+;
+; MSP430-LABEL: @xform_puts(
+; MSP430-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 10)
+; MSP430-NEXT:    ret void
+;
   call i16 @puts(ptr @empty)
 
   ret void
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
new file mode 100644
index 00000000000000..4f9396add2370b
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
@@ -0,0 +1,1208 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+declare float @llvm.fabs.f32(float)
+declare float @llvm.copysign.f32(float, float)
+declare void @llvm.assume(i1 noundef)
+declare float @llvm.log2.f32(float)
+declare float @llvm.exp2.f32(float)
+declare float @llvm.trunc.f32(float)
+declare float @llvm.arithmetic.fence.f32(float)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+
+
+define float @ninf_user_select_inf(i1 %cond, float %x, float %y) {
+; CHECK-LABEL: define float @ninf_user_select_inf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[NINF_USER:%.*]] = fmul ninf float [[SELECT]], [[Y]]
+; CHECK-NEXT:    ret float [[NINF_USER]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %ninf.user = fmul ninf float %y, %select
+  ret float %ninf.user
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf_poison() {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf_poison() {
+; CHECK-NEXT:    ret float poison
+;
+  ret float poison
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf_undef() {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf_undef() {
+; CHECK-NEXT:    ret float undef
+;
+  ret float undef
+}
+
+; Make sure there's no infinite loop
+define nofpclass(all) float @ret_nofpclass_all_var(float %arg) {
+; CHECK-LABEL: define nofpclass(all) float @ret_nofpclass_all_var
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  ret float %arg
+}
+
+; Make sure there's no infinite loop
+define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector(<2 x float> %arg) {
+; CHECK-LABEL: define nofpclass(all) <2 x float> @ret_nofpclass_all_var_vector
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    ret <2 x float> poison
+;
+  ret <2 x float> %arg
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__0() {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__0() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  ret float 0.0
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__pinf() {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__pinf() {
+; CHECK-NEXT:    ret float poison
+;
+  ret float 0x7FF0000000000000
+}
+
+define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__pinf() {
+; CHECK-NEXT:    ret float poison
+;
+  ret float 0x7FF0000000000000
+}
+
+define nofpclass(pinf) float @ret_nofpclass_pinf__ninf() {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__ninf() {
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  ret float 0xFFF0000000000000
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__ninf() {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__ninf() {
+; CHECK-NEXT:    ret float poison
+;
+  ret float 0xFFF0000000000000
+}
+
+; Negative test, do nothing
+define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_inf_lhs(i1 %cond, float nofpclass(inf) %x, float %y) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_inf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(inf) [[X:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float [[Y]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %select = select i1 %cond, float %x, float %y
+  ret float %select
+}
+
+; Make sure nofpclass from source argument is used, fold to %y
+define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs(i1 %cond, float nofpclass(nan norm zero sub) %x, float %y) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float nofpclass(nan zero sub norm) [[X:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %select = select i1 %cond, float %x, float %y
+  ret float %select
+}
+
+; Make sure nofpclass from source argument is used, fold to %x
+define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs(i1 %cond, float %x, float nofpclass(nan norm zero sub) %y) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_arg_only_inf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float nofpclass(nan zero sub norm) [[Y:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %select = select i1 %cond, float %x, float %y
+  ret float %select
+}
+
+; Fold to ret %y
+define nofpclass(inf) [3 x [2 x float]] @ret_float_array(i1 %cond, [3 x [2 x float]] nofpclass(nan norm zero sub) %x, [3 x [2 x float]] %y) {
+; CHECK-LABEL: define nofpclass(inf) [3 x [2 x float]] @ret_float_array
+; CHECK-SAME: (i1 [[COND:%.*]], [3 x [2 x float]] nofpclass(nan zero sub norm) [[X:%.*]], [3 x [2 x float]] [[Y:%.*]]) {
+; CHECK-NEXT:    ret [3 x [2 x float]] [[Y]]
+;
+  %select = select i1 %cond, [3 x [2 x float]] %x, [3 x [2 x float]] %y
+  ret [3 x [2 x float ]] %select
+}
+
+; Fold to ret %x
+define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %select = select i1 %cond, float 0x7FF0000000000000, float %x
+  ret float %select
+}
+
+; Fold to ret %x
+define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  ret float %select
+}
+
+; Fold to poison
+define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_pinf_or_ninf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  %select = select i1 %cond, float 0x7FF0000000000000, float 0xFFF0000000000000
+  ret float %select
+}
+
+; Fold to poison
+define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_ninf_or_pinf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
+  ret float %select
+}
+
+; Fold to pos inf
+define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__select_ninf_or_pinf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
+  ret float %select
+}
+
+; Fold to neg inf
+define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__select_ninf_or_pinf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  %select = select i1 %cond, float 0xFFF0000000000000, float 0x7FF0000000000000
+  ret float %select
+}
+
+; Fold to poison
+define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(zero) float @ret_nofpclass_zero__select_pzero_or_nzero
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  %select = select i1 %cond, float 0.0, float -0.0
+  ret float %select
+}
+
+; Fold to +0
+define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__select_pzero_or_nzero
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %select = select i1 %cond, float 0.0, float -0.0
+  ret float %select
+}
+
+; Fold to -0
+define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__select_pzero_or_nzero
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %select = select i1 %cond, float 0.0, float -0.0
+  ret float %select
+}
+
+; Fold to ret %x
+define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector
+; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
+; CHECK-NEXT:    ret <2 x float> [[X]]
+;
+  %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>, <2 x float> %x
+  ret <2 x float> %select
+}
+
+; Fold to ret %x
+define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_pinf_lhs_vector_undef
+; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
+; CHECK-NEXT:    ret <2 x float> [[X]]
+;
+  %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float poison>, <2 x float> %x
+  ret <2 x float> %select
+}
+
+; Fold to ret %x
+define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: define nofpclass(inf) <2 x float> @ret_nofpclass_inf__select_mixed_inf_lhs_vector
+; CHECK-SAME: (<2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]]) {
+; CHECK-NEXT:    ret <2 x float> [[X]]
+;
+  %select = select <2 x i1> %cond, <2 x float> <float 0x7FF0000000000000, float 0xFFF0000000000000>, <2 x float> %x
+  ret <2 x float> %select
+}
+
+; Can't delete the select
+define nofpclass(inf) float @ret_nofpclass_inf__select_multi_use_pinf_lhs(i1 %cond, float %x, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_multi_use_pinf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
+; CHECK-NEXT:    store float [[SELECT]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %select = select i1 %cond, float 0x7FF0000000000000, float %x
+  store float %select, ptr %ptr
+  ret float %select
+}
+
+; Can't do anything
+define nofpclass(inf) float @ret_nofpclass_inf__select_p0_lhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_p0_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0.000000e+00, float [[X]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %select = select i1 %cond, float 0.0, float %x
+  ret float %select
+}
+
+; Can't do anything
+define nofpclass(inf) float @ret_nofpclass_inf__select_p0_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_p0_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0.000000e+00
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %select = select i1 %cond, float %x, float 0.0
+  ret float %select
+}
+
+; Can't do anything
+define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_lhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %select = select i1 %cond, float 0x7FF0000000000000, float %x
+  ret float %select
+}
+
+; Can't do anything
+define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_nofpclass_nan__select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  ret float %select
+}
+
+define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_0
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
+  %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
+  ret float %select1
+}
+
+define nofpclass(inf nan) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_nofpclass_inf_nan__select_chain_inf_nan_1
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  %select0 = select i1 %cond, float %x, float 0x7FF8000000000000
+  %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
+  ret float %select1
+}
+
+define nofpclass(nan) float @ret_nofpclass_nan__select_chain_inf_nan(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_nofpclass_nan__select_chain_inf_nan
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT1:%.*]] = select i1 [[COND]], float 0x7FF0000000000000, float [[X]]
+; CHECK-NEXT:    ret float [[SELECT1]]
+;
+  %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
+  %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
+  ret float %select1
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_0
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
+  %select1 = select i1 %cond, float 0x7FF0000000000000, float %select0
+  ret float %select1
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %select0 = select i1 %cond, float 0x7FF8000000000000, float %x
+  %select1 = select i1 %cond, float %select0, float 0x7FF0000000000000
+  ret float %select1
+}
+
+; Simplify to fabs %x
+define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[FABS]]
+;
+  %select = select i1 %cond, float %x, float 0xFFF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  ret float %fabs
+}
+
+; Simplify to fabs %x
+define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[FABS]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  ret float %fabs
+}
+
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives__fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT:    ret float [[FABS]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  ret float %fabs
+}
+
+; Fold to fabs(%x), preserving a possible nan's payload bits
+define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[FABS]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  ret float %fabs
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_nan__fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives_nan__fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT:    ret float [[FABS]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  ret float %fabs
+}
+
+; Can fold to poison
+define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nan__fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  ret float %fabs
+}
+
+; Simplify to fneg %x
+define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    ret float [[FNEG]]
+;
+  %select = select i1 %cond, float %x, float 0xFFF0000000000000
+  %fneg = fneg float %select
+  ret float %fneg
+}
+
+; Simplify to fneg %x
+define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    ret float [[FNEG]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fneg = fneg float %select
+  ret float %fneg
+}
+
+; Simplify to fneg %x
+define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    ret float [[FNEG]]
+;
+  %select = select i1 %cond, float 0xFFF0000000000000, float %x
+  %fneg = fneg float %select
+  ret float %fneg
+}
+
+define nofpclass(pzero psub pnorm pinf) float @ret_nofpclass_nopositives___fneg_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives___fneg_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[SELECT]]
+; CHECK-NEXT:    ret float [[FNEG]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fneg = fneg float %select
+  ret float %fneg
+}
+
+; Fold to fneg fabs
+define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[FABS]]
+; CHECK-NEXT:    ret float [[FNEG]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  %fneg = fneg float %fabs
+  ret float %fneg
+}
+
+; Fold to fneg fabs, may need to preserve a nan payload
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[FABS]]
+; CHECK-NEXT:    ret float [[FNEG]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  %fneg = fneg float %fabs
+  ret float %fneg
+}
+
+
+; Fold to poison
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan__fneg_fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float poison
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs = call float @llvm.fabs.f32(float %select)
+  %fneg = fneg float %fabs
+  ret float %fneg
+}
+
+; should fold to ret copysign(%x)
+define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[X]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float 1.0)
+  ret float %copysign
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float -1.0)
+  ret float %copysign
+}
+
+; can fold to fneg(fabs(x))
+define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign(float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign
+; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
+  ret float %copysign
+}
+
+; can fold to fneg(fabs(x))
+define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives_copysign_nnan_flag(float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives_copysign_nnan_flag
+; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg nnan float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign)
+  ret float %copysign
+}
+
+; can fold to fneg(fabs(x))
+define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_nopositives_nonan_copysign(float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_nopositives_nonan_copysign
+; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
+  ret float %copysign
+}
+
+; can fold to fabs(x)
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign(float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign
+; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
+  ret float %copysign
+}
+
+; can fold to fabs(x)
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_copysign_nnan_flag(float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_copysign_nnan_flag
+; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call nnan float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %copysign = call nnan float @llvm.copysign.f32(float %x, float %unknown.sign)
+  ret float %copysign
+}
+
+; can fold to fabs(x)
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_nonan_copysign(float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives_nonan_copysign
+; CHECK-SAME: (float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %copysign = call float @llvm.copysign.f32(float %x, float %unknown.sign)
+  ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs(i1 %cond, float %x, float %sign) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[SIGN:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fabs.sign = call float @llvm.fabs.f32(float %sign)
+  %copysign = call float @llvm.copysign.f32(float %select, float %fabs.sign)
+  ret float %copysign
+}
+
+; Can fold to copysign %x
+define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_no_negatives_noinf__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+; Can fold to copysign %x
+define nofpclass(inf pnorm psub pzero) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_nofpclass_no_positives_noinf__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+; Can't fold because it could have nan payload bits
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+; Can't fold because it could be nan
+define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+; Could fold to copysign with constant
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_nonan__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_no_negatives_nonan__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+; Could fold to copysign %x with constant
+define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SELECT]])
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[COPYSIGN]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %copysign = call float @llvm.copysign.f32(float %select, float %unknown.sign)
+  ret float %copysign
+}
+
+; Do nothing
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_negatives__select_clamp_neg_to_zero(float %x) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_negatives__select_clamp_neg_to_zero
+; CHECK-SAME: (float [[X:%.*]]) {
+; CHECK-NEXT:    [[IS_LT_ZERO:%.*]] = fcmp olt float [[X]], 0.000000e+00
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_LT_ZERO]], float 0.000000e+00, float [[X]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %is.lt.zero = fcmp olt float %x, 0.0
+  %select = select i1 %is.lt.zero, float 0.0, float %x
+  ret float %select
+}
+
+; Can fold to ret %x, assumed to be nan
+define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_negatives__select_clamp_pos_to_zero(float %x) {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_negatives__select_clamp_pos_to_zero
+; CHECK-SAME: (float [[X:%.*]]) {
+; CHECK-NEXT:    [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %is.gt.zero = fcmp ogt float %x, 0.0
+  %select = select i1 %is.gt.zero, float 0.0, float %x
+  ret float %select
+}
+
+; Can fold to ret +0
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nan_negatives__select_clamp_pos_to_zero(float %x) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_nofpclass_nan_negatives__select_clamp_pos_to_zero
+; CHECK-SAME: (float [[X:%.*]]) {
+; CHECK-NEXT:    [[IS_GT_ZERO:%.*]] = fcmp ogt float [[X]], 0.000000e+00
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_GT_ZERO]], float 0.000000e+00, float [[X]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %is.gt.zero = fcmp ogt float %x, 0.0
+  %select = select i1 %is.gt.zero, float 0.0, float %x
+  ret float %select
+}
+
+; Can fold to ret poison
+define nofpclass(nan ninf nnorm nsub zero) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero(float %x) {
+; CHECK-LABEL: define nofpclass(nan ninf zero nsub nnorm) float @ret_nofpclass_nan_negatives_zero__select_clamp_pos_to_zero
+; CHECK-SAME: (float [[X:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %is.gt.zero = fcmp ogt float %x, 0.0
+  %select = select i1 %is.gt.zero, float 0.0, float %x
+  ret float %select
+}
+
+; Can fold to ret %x, assumed to be nan
+define nofpclass(ninf nnorm nsub zero) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero(float %x) {
+; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) float @ret_nofpclass_negatives_zero__select_clamp_pos_to_zero
+; CHECK-SAME: (float [[X:%.*]]) {
+; CHECK-NEXT:    ret float [[X]]
+;
+  %is.gt.zero = fcmp ogt float %x, 0.0
+  %select = select i1 %is.gt.zero, float 0.0, float %x
+  ret float %select
+}
+
+; Assume should allow folding ret %y
+define nofpclass(inf) float @ret_nofpclass_noinfs__assumed_isinf__select_pinf_lhs(i1 %cond, float %x, float %y) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_noinfs__assumed_isinf__select_pinf_lhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[X_IS_INF:%.*]] = fcmp oeq float [[FABS_X]], 0x7FF0000000000000
+; CHECK-NEXT:    call void @llvm.assume(i1 [[X_IS_INF]])
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %fabs.x = call float @llvm.fabs.f32(float %x)
+  %x.is.inf = fcmp oeq float %fabs.x, 0x7FF0000000000000
+  call void @llvm.assume(i1 %x.is.inf)
+  %select = select i1 %cond, float %x, float %y
+  ret float %select
+}
+
+; Ideally should be able to fold out everything after the exp2 call.
+define nofpclass(nan inf nzero nsub nnorm) float @powr_issue64870(float nofpclass(nan inf) %x, float nofpclass(nan inf) %y) {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @powr_issue64870
+; CHECK-SAME: (float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[I1:%.*]] = tail call float @llvm.log2.f32(float [[I]])
+; CHECK-NEXT:    [[I2:%.*]] = fmul float [[I1]], [[Y]]
+; CHECK-NEXT:    [[I3:%.*]] = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float [[I2]])
+; CHECK-NEXT:    [[I6:%.*]] = fcmp oeq float [[X]], 0.000000e+00
+; CHECK-NEXT:    [[I7:%.*]] = select i1 [[I6]], float 0.000000e+00, float [[I3]]
+; CHECK-NEXT:    [[I8:%.*]] = fcmp oeq float [[Y]], 0.000000e+00
+; CHECK-NEXT:    [[I11:%.*]] = fcmp oeq float [[X]], 1.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = select i1 [[I11]], i1 true, i1 [[I8]]
+; CHECK-NEXT:    [[I12:%.*]] = select i1 [[TMP0]], float 1.000000e+00, float [[I7]]
+; CHECK-NEXT:    ret float [[I12]]
+;
+entry:
+  %i = tail call float @llvm.fabs.f32(float %x)
+  %i1 = tail call float @llvm.log2.f32(float %i)
+  %i2 = fmul float %i1, %y
+  %i3 = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float %i2)
+  %i4 = fcmp olt float %y, 0.000000e+00
+  %i5 = select i1 %i4, float 0x7FF0000000000000, float 0.000000e+00
+  %i6 = fcmp oeq float %x, 0.000000e+00
+  %i7 = select i1 %i6, float %i5, float %i3
+  %i8 = fcmp oeq float %y, 0.000000e+00
+  %i9 = select i1 %i6, float 0x7FF8000000000000, float 1.000000e+00
+  %i10 = select i1 %i8, float %i9, float %i7
+  %i11 = fcmp oeq float %x, 1.000000e+00
+  %i12 = select i1 %i11, float 1.000000e+00, float %i10
+  %i13 = fcmp olt float %x, 0.000000e+00
+  %i14 = select i1 %i13, float 0x7FF8000000000000, float %i12
+  ret float %i14
+}
+
+; Different implementation of powr
+define nofpclass(nan inf nzero nsub nnorm) float @test_powr_issue64870_2(float nofpclass(nan inf) %arg, float nofpclass(nan inf) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @test_powr_issue64870_2
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
+; CHECK-NEXT:    [[I2:%.*]] = select i1 [[I]], float 0x7FF8000000000000, float [[ARG]]
+; CHECK-NEXT:    [[I3:%.*]] = tail call float @llvm.log2.f32(float noundef [[I2]])
+; CHECK-NEXT:    [[I4:%.*]] = select i1 [[I]], float 0x7FF8000000000000, float [[ARG1]]
+; CHECK-NEXT:    [[I5:%.*]] = fmul float [[I4]], [[I3]]
+; CHECK-NEXT:    [[I6:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float noundef [[I5]])
+; CHECK-NEXT:    [[I10:%.*]] = fcmp oeq float [[I2]], 0.000000e+00
+; CHECK-NEXT:    [[I12:%.*]] = select i1 [[I10]], float 0.000000e+00, float [[I6]]
+; CHECK-NEXT:    ret float [[I12]]
+;
+bb:
+  %i = fcmp olt float %arg, 0.000000e+00
+  %i2 = select i1 %i, float 0x7FF8000000000000, float %arg
+  %i3 = tail call float @llvm.log2.f32(float noundef %i2) #2
+  %i4 = select i1 %i, float 0x7FF8000000000000, float %arg1
+  %i5 = fmul float %i4, %i3
+  %i6 = tail call noundef nofpclass(ninf nzero nsub nnorm) float @llvm.exp2.f32(float noundef %i5)
+  %i7 = fcmp olt float %i4, 0.000000e+00
+  %i8 = select i1 %i7, float 0x7FF0000000000000, float 0.000000e+00
+  %i9 = fcmp ueq float %i4, 0.000000e+00
+  %i10 = fcmp oeq float %i2, 0.000000e+00
+  %i11 = select i1 %i9, float 0x7FF8000000000000, float %i8
+  %i12 = select i1 %i10, float %i11, float %i6
+  ret float %i12
+}
+
+; implementation of pow with some prunable cases
+define nofpclass(nan inf) float @pow_f32(float nofpclass(nan inf) %arg, float nofpclass(nan inf) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan inf) float @pow_f32
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float noundef [[ARG]])
+; CHECK-NEXT:    [[I2:%.*]] = tail call float @llvm.log2.f32(float noundef [[I]])
+; CHECK-NEXT:    [[I3:%.*]] = fmul float [[I2]], [[ARG1]]
+; CHECK-NEXT:    [[I4:%.*]] = tail call noundef float @llvm.exp2.f32(float noundef [[I3]])
+; CHECK-NEXT:    [[I5:%.*]] = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float noundef [[ARG1]])
+; CHECK-NEXT:    [[I6:%.*]] = tail call float @llvm.trunc.f32(float noundef [[I5]])
+; CHECK-NEXT:    [[I7:%.*]] = fcmp oeq float [[I6]], [[I5]]
+; CHECK-NEXT:    [[I8:%.*]] = fmul float [[I5]], 5.000000e-01
+; CHECK-NEXT:    [[I9:%.*]] = tail call float @llvm.trunc.f32(float noundef [[I8]])
+; CHECK-NEXT:    [[I10:%.*]] = fcmp une float [[I9]], [[I8]]
+; CHECK-NEXT:    [[I11:%.*]] = and i1 [[I7]], [[I10]]
+; CHECK-NEXT:    [[I12:%.*]] = select i1 [[I11]], float [[ARG]], float 1.000000e+00
+; CHECK-NEXT:    [[I13:%.*]] = tail call noundef float @llvm.copysign.f32(float noundef [[I4]], float noundef [[I12]])
+; CHECK-NEXT:    [[I17:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
+; CHECK-NEXT:    [[I21:%.*]] = select i1 [[I11]], float [[ARG]], float 0.000000e+00
+; CHECK-NEXT:    [[I22:%.*]] = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef 0.000000e+00, float noundef [[I21]])
+; CHECK-NEXT:    [[I23:%.*]] = select i1 [[I17]], float [[I22]], float [[I13]]
+; CHECK-NEXT:    [[I24:%.*]] = fcmp oeq float [[ARG]], 1.000000e+00
+; CHECK-NEXT:    [[I25:%.*]] = fcmp oeq float [[ARG1]], 0.000000e+00
+; CHECK-NEXT:    [[I26:%.*]] = or i1 [[I24]], [[I25]]
+; CHECK-NEXT:    [[I27:%.*]] = select i1 [[I26]], float 1.000000e+00, float [[I23]]
+; CHECK-NEXT:    ret float [[I27]]
+;
+bb:
+  %i = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float noundef %arg)
+  %i2 = tail call float @llvm.log2.f32(float noundef %i)
+  %i3 = fmul float %i2, %arg1
+  %i4 = tail call noundef float @llvm.exp2.f32(float noundef %i3)
+  %i5 = tail call nofpclass(ninf nzero nsub nnorm) float @llvm.fabs.f32(float noundef %arg1)
+  %i6 = tail call float @llvm.trunc.f32(float noundef %i5)
+  %i7 = fcmp oeq float %i6, %i5
+  %i8 = fmul float %i5, 5.000000e-01
+  %i9 = tail call float @llvm.trunc.f32(float noundef %i8)
+  %i10 = fcmp une float %i9, %i8
+  %i11 = and i1 %i7, %i10
+  %i12 = select i1 %i11, float %arg, float 1.000000e+00
+  %i13 = tail call noundef float @llvm.copysign.f32(float noundef %i4, float noundef %i12)
+  %i14 = fcmp olt float %arg, 0.000000e+00
+  %i15 = select i1 %i7, float %i13, float 0x7FF8000000000000
+  %i16 = select i1 %i14, float %i15, float %i13
+  %i17 = fcmp oeq float %arg, 0.000000e+00
+  %i18 = fcmp olt float %arg1, 0.000000e+00
+  %i19 = xor i1 %i17, %i18
+  %i20 = select i1 %i19, float 0.000000e+00, float 0x7FF0000000000000
+  %i21 = select i1 %i11, float %arg, float 0.000000e+00
+  %i22 = tail call noundef nofpclass(nan sub norm) float @llvm.copysign.f32(float noundef %i20, float noundef %i21)
+  %i23 = select i1 %i17, float %i22, float %i16
+  %i24 = fcmp oeq float %arg, 1.000000e+00
+  %i25 = fcmp oeq float %arg1, 0.000000e+00
+  %i26 = or i1 %i24, %i25
+  %i27 = select i1 %i26, float 1.000000e+00, float %i23
+  ret float %i27
+}
+
+declare float @extern()
+
+; Make sure nofpclass from arbitrary callsite is used, fold to %y
+define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_nofpclass_call_only_inf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern()
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %must.be.inf = call nofpclass(nan norm zero sub) float @extern()
+  %select = select i1 %cond, float %must.be.inf, float %y
+  ret float %select
+}
+
+define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__nofpclass_call_only_inf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern()
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  %must.be.inf = call nofpclass(nan norm zero sub) float @extern()
+  ret float %must.be.inf
+}
+
+define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__nofpclass_call_only_inf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_INF:%.*]] = call nofpclass(nan zero sub norm) float @extern()
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %must.be.inf = call nofpclass(nan norm zero sub) float @extern()
+  ret float %must.be.inf
+}
+
+define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(nzero) float @ret_nofpclass_nzero__nofpclass_call_only_zero
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern()
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %must.be.zero = call nofpclass(nan sub norm inf) float @extern()
+  ret float %must.be.zero
+}
+
+define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(pzero) float @ret_nofpclass_pzero__nofpclass_call_only_zero
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_ZERO:%.*]] = call nofpclass(nan inf sub norm) float @extern()
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %must.be.zero = call nofpclass(nan sub norm inf) float @extern()
+  ret float %must.be.zero
+}
+
+; Should not fold this, should not assume payload/sign bits are canonical
+define nofpclass(qnan) float @ret_nofpclass_qnan__nofpclass_call_only_nan(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(qnan) float @ret_nofpclass_qnan__nofpclass_call_only_nan
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_NAN:%.*]] = call nofpclass(inf zero sub norm) float @extern()
+; CHECK-NEXT:    ret float [[MUST_BE_NAN]]
+;
+  %must.be.nan = call nofpclass(inf norm zero sub) float @extern()
+  ret float %must.be.nan
+}
+
+; Should not fold this, should not assume payload/sign bits are canonical
+define nofpclass(snan) float @ret_nofpclass_snan__nofpclass_call_only_nan(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(snan) float @ret_nofpclass_snan__nofpclass_call_only_nan
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_NAN:%.*]] = call nofpclass(inf zero sub norm) float @extern()
+; CHECK-NEXT:    ret float [[MUST_BE_NAN]]
+;
+  %must.be.nan = call nofpclass(inf norm zero sub) float @extern()
+  ret float %must.be.nan
+}
+
+; Assume call should allow folding this to %y
+; TODO: Not working, multiple user problem.
+define nofpclass(inf) float @ret_nofpclass_inf__select_assumed_call_result_only_inf(i1 %cond, float %y) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__select_assumed_call_result_only_inf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[MUST_BE_INF:%.*]] = call float @extern()
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[MUST_BE_INF]])
+; CHECK-NEXT:    [[IS_INF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_INF]])
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[MUST_BE_INF]], float [[Y]]
+; CHECK-NEXT:    ret float [[SELECT]]
+;
+  %must.be.inf = call float @extern()
+  %fabs = call float @llvm.fabs.f32(float %must.be.inf)
+  %is.inf = fcmp oeq float %fabs, 0x7FF0000000000000
+  call void @llvm.assume(i1 %is.inf)
+  %select = select i1 %cond, float %must.be.inf, float %y
+  ret float %select
+}
+
+define nofpclass(inf) float @ret_nofpclass_inf__simple_phi_inf_or_unknown(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__simple_phi_inf_or_unknown
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND]], label [[BB0:%.*]], label [[RET:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[RET]]
+; CHECK:       ret:
+; CHECK-NEXT:    [[PHI:%.*]] = phi float [ 0x7FF0000000000000, [[ENTRY:%.*]] ], [ [[X]], [[BB0]] ]
+; CHECK-NEXT:    ret float [[PHI]]
+;
+entry:
+  br i1 %cond, label %bb0, label %ret
+
+bb0:
+  br label %ret
+
+ret:
+  %phi = phi float [ 0x7FF0000000000000, %entry ], [ %x, %bb0 ]
+  ret float %phi
+}
+
+declare i1 @loop.cond()
+
+declare float @loop.func()
+
+
+; Should be able to fold inf initial value to poison
+define nofpclass(inf) float @ret_nofpclass_inf__phi_0(i1 %cond0, float %unknown) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__phi_0
+; CHECK-SAME: (i1 [[COND0:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND0]], label [[LOOP:%.*]], label [[RET:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI_LOOP:%.*]] = phi float [ 0x7FF0000000000000, [[ENTRY:%.*]] ], [ [[LOOP_FUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[LOOP_FUNC]] = call nofpclass(nan) float @loop.func()
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = call i1 @loop.cond()
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]]
+; CHECK:       ret:
+; CHECK-NEXT:    [[PHI_RET:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[PHI_LOOP]], [[LOOP]] ]
+; CHECK-NEXT:    ret float [[PHI_RET]]
+;
+entry:
+  br i1 %cond0, label %loop, label %ret
+
+loop:
+  %phi.loop = phi float [ 0x7FF0000000000000, %entry ], [ %loop.func, %loop ]
+  %loop.func = call nofpclass(nan) float @loop.func()
+  %loop.cond = call i1 @loop.cond()
+  br i1 %loop.cond, label %ret, label %loop
+
+ret:
+  %phi.ret = phi float [ 0.0, %entry ], [ %phi.loop, %loop ]
+  ret float %phi.ret
+}
+
+; fold to ret 0
+define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_0(i1 %cond0, float %unknown) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_0
+; CHECK-SAME: (i1 [[COND0:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND0]], label [[LOOP:%.*]], label [[RET:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = call i1 @loop.cond()
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+entry:
+  br i1 %cond0, label %loop, label %ret
+
+loop:
+  %phi.loop = phi float [ 0x7FF0000000000000, %entry ], [ %phi.loop, %loop ]
+  %loop.cond = call i1 @loop.cond()
+  br i1 %loop.cond, label %ret, label %loop
+
+ret:
+  %phi.ret = phi float [ 0.0, %entry ], [ %phi.loop, %loop ]
+  ret float %phi.ret
+}
+
+; fold to ret poison
+define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_1(i1 %cond0, float %unknown) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__recursive_phi_1
+; CHECK-SAME: (i1 [[COND0:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND0]], label [[LOOP:%.*]], label [[RET:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = call i1 @loop.cond()
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret float poison
+;
+entry:
+  br i1 %cond0, label %loop, label %ret
+
+loop:
+  %phi.loop = phi float [ 0x7FF0000000000000, %entry ], [ %phi.loop, %loop ]
+  %loop.cond = call i1 @loop.cond()
+  br i1 %loop.cond, label %ret, label %loop
+
+ret:
+  %phi.ret = phi float [ 0x7FF0000000000000, %entry ], [ %phi.loop, %loop ]
+  ret float %phi.ret
+}
+
+; Should be able to fold inf initial values to poison
+define nofpclass(inf) float @ret_nofpclass_inf__phi_switch_repeated_predecessor(i32 %switch, float %unknown) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__phi_switch_repeated_predecessor
+; CHECK-SAME: (i32 [[SWITCH:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[SWITCH]], label [[RET:%.*]] [
+; CHECK-NEXT:    i32 0, label [[LOOP:%.*]]
+; CHECK-NEXT:    i32 1, label [[LOOP]]
+; CHECK-NEXT:    ]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI_LOOP:%.*]] = phi float [ 0x7FF0000000000000, [[ENTRY:%.*]] ], [ 0x7FF0000000000000, [[ENTRY]] ], [ [[UNKNOWN]], [[LOOP]] ]
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = call i1 @loop.cond()
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[RET]], label [[LOOP]]
+; CHECK:       ret:
+; CHECK-NEXT:    [[PHI_RET:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[PHI_LOOP]], [[LOOP]] ]
+; CHECK-NEXT:    ret float [[PHI_RET]]
+;
+entry:
+  switch i32 %switch, label %ret [
+  i32 0, label %loop
+  i32 1, label %loop
+  ]
+
+loop:
+  %phi.loop = phi float [ 0x7FF0000000000000, %entry ], [ 0x7FF0000000000000, %entry ], [ %unknown, %loop ]
+  %loop.cond = call i1 @loop.cond()
+  br i1 %loop.cond, label %ret, label %loop
+
+ret:
+  %phi.ret = phi float [ 0.0, %entry ], [ %phi.loop, %loop ]
+  ret float %phi.ret
+}
+
+; Simplify to arithmetic.fence %x
+define nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__arithmetic_fence_select_pinf_rhs
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[FENCE:%.*]] = call float @llvm.arithmetic.fence.f32(float [[X]])
+; CHECK-NEXT:    ret float [[FENCE]]
+;
+  %select = select i1 %cond, float %x, float 0x7FF0000000000000
+  %fence = call float @llvm.arithmetic.fence.f32(float %select)
+  ret float %fence
+}
+
+; Can simplify to %x
+define nofpclass(pinf) float @ret_nofpclass_pinf__minnum_pinf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__minnum_pinf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[MIN:%.*]] = call float @llvm.minnum.f32(float [[X]], float 0x7FF0000000000000)
+; CHECK-NEXT:    ret float [[MIN]]
+;
+  %min = call float @llvm.minnum.f32(float %x, float 0x7FF0000000000000)
+  ret float %min
+}
+
+; Can fold to -inf
+define nofpclass(pinf) float @ret_nofpclass_pinf__minnum_ninf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_nofpclass_pinf__minnum_ninf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  %min = call float @llvm.minnum.f32(float %x, float 0xFFF0000000000000)
+  ret float %min
+}
+
+; Can simplify to %x
+define nofpclass(ninf) float @ret_nofpclass_ninf__maxnum_ninf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__maxnum_ninf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    [[MAX:%.*]] = call float @llvm.maxnum.f32(float [[X]], float 0xFFF0000000000000)
+; CHECK-NEXT:    ret float [[MAX]]
+;
+  %max = call float @llvm.maxnum.f32(float %x, float 0xFFF0000000000000)
+  ret float %max
+}
+
+; Can fold to +inf
+define nofpclass(ninf) float @ret_nofpclass_ninf__maxnum_pinf(i1 %cond, float %x) {
+; CHECK-LABEL: define nofpclass(ninf) float @ret_nofpclass_ninf__maxnum_pinf
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %max = call float @llvm.maxnum.f32(float %x, float 0x7FF0000000000000)
+  ret float %max
+}
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
index 21c08dc23148b7..ae23a2e9ce3523 100644
--- a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
@@ -75,8 +75,8 @@ define i32 @test2(i32 %l86) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[L86_OFF:%.*]] = add i32 [[L86:%.*]], -1
 ; CHECK-NEXT:    [[SWITCH:%.*]] = icmp ult i32 [[L86_OFF]], 24
-; CHECK-NEXT:    [[DOTNOT30:%.*]] = icmp ne i32 [[L86]], 25
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = zext i1 [[DOTNOT30]] to i32
+; CHECK-NEXT:    [[OR_COND23_NOT:%.*]] = icmp ne i32 [[L86]], 25
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND23_NOT]] to i32
 ; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[SWITCH]], i32 0, i32 [[SPEC_SELECT]]
 ; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
new file mode 100644
index 00000000000000..29a64d70a3635a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s
+
+define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){
+; CHECK-LABEL: define void @clamped_tc_8
+; CHECK: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> {{.*}}, ptr {{.*}}, i32 1, <vscale x 8 x i1> {{.*}})
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %p_out_tail.09 = phi ptr [ %dst, %entry ], [ %incdec.ptr, %for.body ]
+  %0 = shl nuw nsw i64 %indvars.iv, 3
+  %shr3 = lshr i64 %val, %0
+  %conv4 = trunc i64 %shr3 to i8
+  store i8 %conv4, ptr %p_out_tail.09, align 1
+  %incdec.ptr = getelementptr inbounds i8, ptr %p_out_tail.09, i64 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
index c1e27ca468f37a..2017797288c297 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-calls-libsystem-darwin.ll
@@ -195,7 +195,7 @@ for.end:
   ret void
 }
 
-declare float @atan2f(float) nounwind readnone
+declare float @atan2f(float, float) nounwind readnone
 define void @atan2f_v4f32(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-LABEL: @atan2f_v4f32(
 ; CHECK: call <4 x float> @_simd_atan2_f4(
@@ -208,7 +208,7 @@ for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
   %lv = load float, ptr %gep.y, align 4
-  %call = tail call float @atan2f(float %lv)
+  %call = tail call float @atan2f(float %lv, float %lv)
   %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
   store float %call, ptr %gep.x, align 4
   %iv.next = add i64 %iv, 1
@@ -219,7 +219,7 @@ for.end:
   ret void
 }
 
-declare double @atan2(double) nounwind readnone
+declare double @atan2(double, double) nounwind readnone
 define void @atan2_v2f64(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-LABEL: @atan2_v2f64(
 ; CHECK: call <2 x double> @_simd_atan2_d2(
@@ -232,7 +232,7 @@ for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
   %lv = load double, ptr %gep.y, align 4
-  %call = tail call double @atan2(double %lv)
+  %call = tail call double @atan2(double %lv, double %lv)
   %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
   store double %call, ptr %gep.x, align 4
   %iv.next = add i64 %iv, 1
@@ -387,7 +387,7 @@ for.end:
   ret void
 }
 
-declare float @powf(float) nounwind readnone
+declare float @powf(float, float) nounwind readnone
 define void @powf_v4f32(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-LABEL: @powf_v4f32(
 ; CHECK: call <4 x float> @_simd_pow_f4(
@@ -400,7 +400,7 @@ for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
   %lv = load float, ptr %gep.y, align 4
-  %call = tail call float @powf(float %lv)
+  %call = tail call float @powf(float %lv, float %lv)
   %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
   store float %call, ptr %gep.x, align 4
   %iv.next = add i64 %iv, 1
@@ -411,7 +411,7 @@ for.end:
   ret void
 }
 
-declare double @pow(double) nounwind readnone
+declare double @pow(double, double) nounwind readnone
 define void @pow_v2f64(i64 %n, ptr noalias %y, ptr noalias %x) {
 ; CHECK-LABEL: @pow_v2f64(
 ; CHECK: call <2 x double> @_simd_pow_d2(
@@ -424,7 +424,7 @@ for.body:
   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
   %gep.y = getelementptr inbounds double, ptr %y, i64 %iv
   %lv = load double, ptr %gep.y, align 4
-  %call = tail call double @pow(double %lv)
+  %call = tail call double @pow(double %lv, double %lv)
   %gep.x = getelementptr inbounds double, ptr %x, i64 %iv
   store double %call, ptr %gep.x, align 4
   %iv.next = add i64 %iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
index c2871dd6eeb22f..ba6e85e2852952 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll
@@ -65,8 +65,8 @@ declare float @acosf(float) #0
 declare double @atan(double) #0
 declare float @atanf(float) #0
 
-declare double @atan2(double) #0
-declare float @atan2f(float) #0
+declare double @atan2(double, double) #0
+declare float @atan2f(float, float) #0
 
 declare double @sinh(double) #0
 declare float @sinhf(float) #0
@@ -1210,7 +1210,7 @@ for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %tmp = trunc i64 %iv to i32
   %conv = sitofp i32 %tmp to double
-  %call = tail call double @atan2(double %conv)
+  %call = tail call double @atan2(double %conv, double %conv)
   %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
   store double %call, ptr %arrayidx, align 4
   %iv.next = add nuw nsw i64 %iv, 1
@@ -1233,7 +1233,7 @@ for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %tmp = trunc i64 %iv to i32
   %conv = sitofp i32 %tmp to float
-  %call = tail call float @atan2f(float %conv)
+  %call = tail call float @atan2f(float %conv, float %conv)
   %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
   store float %call, ptr %arrayidx, align 4
   %iv.next = add nuw nsw i64 %iv, 1
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
index 64eb35ebe85344..240882bab636dd 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
@@ -9,20 +9,20 @@ define dso_local double @test(ptr %Arr) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP3]])
-; CHECK-NEXT:    [[TMP5]] = fadd fast <2 x double> [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4]] = fadd fast <2 x double> [[TMP3]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
-; CHECK-NEXT:    ret double [[TMP7]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
+; CHECK-NEXT:    ret double [[TMP6]]
 ;
 entry:
   br label %for.cond
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index d6196c84e8e2de..99f03cdcf648c0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -461,44 +461,21 @@ exit:
 define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
 ; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]]
 ; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 00cabd58de9139..fab924580cbb70 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -22,14 +22,14 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]]
 ; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; RV32:       vector.memcheck:
-; RV32-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
-; RV32-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
-; RV32-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
-; RV32-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
-; RV32-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
+; RV32-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; RV32-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; RV32-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV32-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; RV32-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
 ; RV32-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV32-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
-; RV32-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; RV32-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
+; RV32-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
 ; RV32-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
 ; RV32-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; RV32-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
@@ -104,14 +104,14 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]]
 ; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; RV64:       vector.memcheck:
-; RV64-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
-; RV64-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
-; RV64-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
-; RV64-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[UGLYGEP1]]
-; RV64-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[UGLYGEP]]
+; RV64-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
+; RV64-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
+; RV64-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV64-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; RV64-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
 ; RV64-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; RV64-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[UGLYGEP2]]
-; RV64-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[UGLYGEP]]
+; RV64-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
+; RV64-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
 ; RV64-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
 ; RV64-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
 ; RV64-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 31d0b2abfe18ab..38ef0537e43183 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -6,21 +6,32 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -56,21 +67,32 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LowerTypeTests/import.ll b/llvm/test/Transforms/LowerTypeTests/import.ll
index abe85460ab5bb6..1eff4bbbbdf971 100644
--- a/llvm/test/Transforms/LowerTypeTests/import.ll
+++ b/llvm/test/Transforms/LowerTypeTests/import.ll
@@ -37,11 +37,13 @@ define i1 @allones7(ptr %p) {
 ; X86-SAME: ptr [[P:%.*]]) {
 ; X86-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; X86-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_allones7_global_addr to i64)
-; X86-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_allones7_align to i8) to i64)
-; X86-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_allones7_align to i8)) to i64)
-; X86-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_allones7_size_m1 to i64)
-; X86-NEXT:    ret i1 [[TMP6]]
+; X86-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_allones7_align to i8) to i64
+; X86-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_allones7_align to i8)) to i64
+; X86-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_allones7_size_m1 to i64)
+; X86-NEXT:    ret i1 [[TMP8]]
 ;
 ; ARM-LABEL: define i1 @allones7(
 ; ARM-SAME: ptr [[P:%.*]]) {
@@ -62,11 +64,13 @@ define i1 @allones32(ptr %p) {
 ; X86-SAME: ptr [[P:%.*]]) {
 ; X86-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; X86-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_allones32_global_addr to i64)
-; X86-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_allones32_align to i8) to i64)
-; X86-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_allones32_align to i8)) to i64)
-; X86-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_allones32_size_m1 to i64)
-; X86-NEXT:    ret i1 [[TMP6]]
+; X86-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_allones32_align to i8) to i64
+; X86-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_allones32_align to i8)) to i64
+; X86-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_allones32_size_m1 to i64)
+; X86-NEXT:    ret i1 [[TMP8]]
 ;
 ; ARM-LABEL: define i1 @allones32(
 ; ARM-SAME: ptr [[P:%.*]]) {
@@ -87,20 +91,22 @@ define i1 @bytearray7(ptr %p) {
 ; X86-SAME: ptr [[P:%.*]]) {
 ; X86-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; X86-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_bytearray7_global_addr to i64)
-; X86-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_bytearray7_align to i8) to i64)
-; X86-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_bytearray7_align to i8)) to i64)
-; X86-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_bytearray7_size_m1 to i64)
-; X86-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP12:%.*]]
-; X86:       7:
-; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr @__typeid_bytearray7_byte_array, i64 [[TMP5]]
-; X86-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
-; X86-NEXT:    [[TMP10:%.*]] = and i8 [[TMP9]], ptrtoint (ptr @__typeid_bytearray7_bit_mask to i8)
-; X86-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; X86-NEXT:    br label [[TMP12]]
-; X86:       12:
-; X86-NEXT:    [[TMP13:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP11]], [[TMP7]] ]
-; X86-NEXT:    ret i1 [[TMP13]]
+; X86-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_bytearray7_align to i8) to i64
+; X86-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_bytearray7_align to i8)) to i64
+; X86-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_bytearray7_size_m1 to i64)
+; X86-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP14:%.*]]
+; X86:       9:
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr @__typeid_bytearray7_byte_array, i64 [[TMP7]]
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = and i8 [[TMP11]], ptrtoint (ptr @__typeid_bytearray7_bit_mask to i8)
+; X86-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0
+; X86-NEXT:    br label [[TMP14]]
+; X86:       14:
+; X86-NEXT:    [[TMP15:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP13]], [[TMP9]] ]
+; X86-NEXT:    ret i1 [[TMP15]]
 ;
 ; ARM-LABEL: define i1 @bytearray7(
 ; ARM-SAME: ptr [[P:%.*]]) {
@@ -130,20 +136,22 @@ define i1 @bytearray32(ptr %p) {
 ; X86-SAME: ptr [[P:%.*]]) {
 ; X86-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; X86-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_bytearray32_global_addr to i64)
-; X86-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_bytearray32_align to i8) to i64)
-; X86-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_bytearray32_align to i8)) to i64)
-; X86-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_bytearray32_size_m1 to i64)
-; X86-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP12:%.*]]
-; X86:       7:
-; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr @__typeid_bytearray32_byte_array, i64 [[TMP5]]
-; X86-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
-; X86-NEXT:    [[TMP10:%.*]] = and i8 [[TMP9]], ptrtoint (ptr @__typeid_bytearray32_bit_mask to i8)
-; X86-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; X86-NEXT:    br label [[TMP12]]
-; X86:       12:
-; X86-NEXT:    [[TMP13:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP11]], [[TMP7]] ]
-; X86-NEXT:    ret i1 [[TMP13]]
+; X86-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_bytearray32_align to i8) to i64
+; X86-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_bytearray32_align to i8)) to i64
+; X86-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_bytearray32_size_m1 to i64)
+; X86-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP14:%.*]]
+; X86:       9:
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr @__typeid_bytearray32_byte_array, i64 [[TMP7]]
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = and i8 [[TMP11]], ptrtoint (ptr @__typeid_bytearray32_bit_mask to i8)
+; X86-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0
+; X86-NEXT:    br label [[TMP14]]
+; X86:       14:
+; X86-NEXT:    [[TMP15:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP13]], [[TMP9]] ]
+; X86-NEXT:    ret i1 [[TMP15]]
 ;
 ; ARM-LABEL: define i1 @bytearray32(
 ; ARM-SAME: ptr [[P:%.*]]) {
@@ -173,21 +181,23 @@ define i1 @inline5(ptr %p) {
 ; X86-SAME: ptr [[P:%.*]]) {
 ; X86-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; X86-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_inline5_global_addr to i64)
-; X86-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_inline5_align to i8) to i64)
-; X86-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_inline5_align to i8)) to i64)
-; X86-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_inline5_size_m1 to i64)
-; X86-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP13:%.*]]
-; X86:       7:
-; X86-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
-; X86-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], 31
-; X86-NEXT:    [[TMP10:%.*]] = shl i32 1, [[TMP9]]
-; X86-NEXT:    [[TMP11:%.*]] = and i32 ptrtoint (ptr @__typeid_inline5_inline_bits to i32), [[TMP10]]
-; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
-; X86-NEXT:    br label [[TMP13]]
-; X86:       13:
-; X86-NEXT:    [[TMP14:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP12]], [[TMP7]] ]
-; X86-NEXT:    ret i1 [[TMP14]]
+; X86-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_inline5_align to i8) to i64
+; X86-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_inline5_align to i8)) to i64
+; X86-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_inline5_size_m1 to i64)
+; X86-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP15:%.*]]
+; X86:       9:
+; X86-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP7]] to i32
+; X86-NEXT:    [[TMP11:%.*]] = and i32 [[TMP10]], 31
+; X86-NEXT:    [[TMP12:%.*]] = shl i32 1, [[TMP11]]
+; X86-NEXT:    [[TMP13:%.*]] = and i32 ptrtoint (ptr @__typeid_inline5_inline_bits to i32), [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+; X86-NEXT:    br label [[TMP15]]
+; X86:       15:
+; X86-NEXT:    [[TMP16:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP14]], [[TMP9]] ]
+; X86-NEXT:    ret i1 [[TMP16]]
 ;
 ; ARM-LABEL: define i1 @inline5(
 ; ARM-SAME: ptr [[P:%.*]]) {
@@ -218,20 +228,22 @@ define i1 @inline6(ptr %p) {
 ; X86-SAME: ptr [[P:%.*]]) {
 ; X86-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; X86-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_inline6_global_addr to i64)
-; X86-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_inline6_align to i8) to i64)
-; X86-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_inline6_align to i8)) to i64)
-; X86-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_inline6_size_m1 to i64)
-; X86-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP12:%.*]]
-; X86:       7:
-; X86-NEXT:    [[TMP8:%.*]] = and i64 [[TMP5]], 63
-; X86-NEXT:    [[TMP9:%.*]] = shl i64 1, [[TMP8]]
-; X86-NEXT:    [[TMP10:%.*]] = and i64 ptrtoint (ptr @__typeid_inline6_inline_bits to i64), [[TMP9]]
-; X86-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
-; X86-NEXT:    br label [[TMP12]]
-; X86:       12:
-; X86-NEXT:    [[TMP13:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP11]], [[TMP7]] ]
-; X86-NEXT:    ret i1 [[TMP13]]
+; X86-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_inline6_align to i8) to i64
+; X86-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_inline6_align to i8)) to i64
+; X86-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_inline6_size_m1 to i64)
+; X86-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP14:%.*]]
+; X86:       9:
+; X86-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 63
+; X86-NEXT:    [[TMP11:%.*]] = shl i64 1, [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = and i64 ptrtoint (ptr @__typeid_inline6_inline_bits to i64), [[TMP11]]
+; X86-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP12]], 0
+; X86-NEXT:    br label [[TMP14]]
+; X86:       14:
+; X86-NEXT:    [[TMP15:%.*]] = phi i1 [ false, [[TMP0:%.*]] ], [ [[TMP13]], [[TMP9]] ]
+; X86-NEXT:    ret i1 [[TMP15]]
 ;
 ; ARM-LABEL: define i1 @inline6(
 ; ARM-SAME: ptr [[P:%.*]]) {
diff --git a/llvm/test/Transforms/LowerTypeTests/simplify.ll b/llvm/test/Transforms/LowerTypeTests/simplify.ll
index 3ad856d9bb24b8..ff0f941eece993 100644
--- a/llvm/test/Transforms/LowerTypeTests/simplify.ll
+++ b/llvm/test/Transforms/LowerTypeTests/simplify.ll
@@ -11,17 +11,19 @@ define i1 @bytearray7(ptr %p) {
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], ptrtoint (ptr @__typeid_bytearray7_global_addr to i64)
-; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], zext (i8 ptrtoint (ptr @__typeid_bytearray7_align to i8) to i64)
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP2]], zext (i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_bytearray7_align to i8)) to i64)
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule i64 [[TMP5]], ptrtoint (ptr @__typeid_bytearray7_size_m1 to i64)
-; CHECK-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[F:%.*]]
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr @__typeid_bytearray7_byte_array, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = and i8 [[TMP9]], ptrtoint (ptr @__typeid_bytearray7_bit_mask to i8)
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT:    br i1 [[TMP11]], label [[T:%.*]], label [[F]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 ptrtoint (ptr @__typeid_bytearray7_align to i8) to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 sub (i8 64, i8 ptrtoint (ptr @__typeid_bytearray7_align to i8)) to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule i64 [[TMP7]], ptrtoint (ptr @__typeid_bytearray7_size_m1 to i64)
+; CHECK-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[F:%.*]]
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr @__typeid_bytearray7_byte_array, i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = and i8 [[TMP11]], ptrtoint (ptr @__typeid_bytearray7_bit_mask to i8)
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[T:%.*]], label [[F]]
 ; CHECK:       t:
 ; CHECK-NEXT:    ret i1 true
 ; CHECK:       f:
diff --git a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
index 28464c795ff239..fc85f57e020c10 100644
--- a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
@@ -11,7 +11,7 @@ define void @form_memcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b)
 ; CHECK-NEXT:    br label %"bb1+bb2"
 ; CHECK:       "bb1+bb2":
 ; CHECK-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 16)
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
index b3884368ef04e7..8d41d9a486801f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
@@ -94,7 +94,7 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) {
 ; CHECK-NEXT:    [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ]
 ; CHECK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
new file mode 100644
index 00000000000000..0783a28f56d850
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @p(double %0) {
+; CHECK-LABEL: define void @p(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr null, align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %mul16.1.i = fmul double %0, 0.000000e+00
+  %add21.1.i = fadd double %mul16.1.i, 0.000000e+00
+  %add21.2.i = fadd double %add21.1.i, 0.000000e+00
+  %mul16.150.i = fmul double 0.000000e+00, 0.000000e+00
+  %add21.1.1.i = fadd double 0.000000e+00, %mul16.150.i
+  %add21.2.1.i = fadd double %add21.1.1.i, 0.000000e+00
+  %mul16.252.i = fmul double 0.000000e+00, 0.000000e+00
+  %add21.1.2.i = fadd double 0.000000e+00, %mul16.252.i
+  %add21.2.2.i = fadd double %add21.1.2.i, 0.000000e+00
+  %add21.2.165.i = fadd double %add21.1.i, 0.000000e+00
+  %mul16.150.1.i = fmul double 0.000000e+00, 0.000000e+00
+  %add21.1.1.1.i = fadd double %mul16.150.1.i, 0.000000e+00
+  %add21.2.1.1.i = fadd double %add21.1.1.1.i, 0.000000e+00
+  %add21.2.2.1.i = fadd double 0.000000e+00, %mul16.150.1.i
+  %mul16.1.1.i36 = fmul double %add21.2.1.1.i, 0.000000e+00
+  %add21.1.1.i37 = fadd double 0.000000e+00, %mul16.1.1.i36
+  %add21.2.1.i40 = fadd double %add21.1.1.i37, 0.000000e+00
+  %mul16.252.i43 = fmul double %add21.2.2.i, 0.000000e+00
+  %mul16.1.2.i45 = fmul double %add21.2.2.1.i, 0.000000e+00
+  %add21.1.2.i46 = fadd double %mul16.252.i43, %mul16.1.2.i45
+  %add21.2.2.i49 = fadd double %add21.1.2.i46, 0.000000e+00
+  %mul16.157.i51 = fmul double %add21.2.i, 0.000000e+00
+  %mul16.1.160.i52 = fmul double %add21.2.165.i, 0.000000e+00
+  %add21.1.161.i53 = fadd double %mul16.157.i51, %mul16.1.160.i52
+  %add21.2.165.i56 = fadd double %add21.1.161.i53, 0.000000e+00
+  %mul16.150.1.i58 = fmul double %add21.2.1.i, 0.000000e+00
+  %add21.1.1.1.i60 = fadd double %mul16.150.1.i58, 0.000000e+00
+  %add21.2.1.1.i62 = fadd double %add21.1.1.1.i60, 0.000000e+00
+  %conv14.1 = fptosi double %add21.2.1.i40 to i32
+  %arrayidx16.1 = getelementptr i32, ptr null, i64 1
+  store i32 %conv14.1, ptr %arrayidx16.1, align 4
+  %conv14.2 = fptosi double %add21.2.2.i49 to i32
+  %arrayidx16.2 = getelementptr i32, ptr null, i64 2
+  store i32 %conv14.2, ptr %arrayidx16.2, align 4
+  %conv14.3 = fptosi double %add21.2.165.i56 to i32
+  %arrayidx16.3 = getelementptr i32, ptr null, i64 3
+  store i32 %conv14.3, ptr %arrayidx16.3, align 4
+  %conv14.4 = fptosi double %add21.2.1.1.i62 to i32
+  store i32 %conv14.4, ptr null, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
index 47347719d373a0..111ca8abf8122f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
@@ -11,15 +11,15 @@ define void @splat_loads_double(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB)
 ; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
 ; CHECK-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
 ; CHECK-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ARRAY1]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[ARRAY1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -52,15 +52,15 @@ define void @splat_loads_float(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds float, ptr [[ARRAY2:%.*]], i64 1
 ; CHECK-NEXT:    [[LD_2_0:%.*]] = load float, ptr [[ARRAY2]], align 8
 ; CHECK-NEXT:    [[LD_2_1:%.*]] = load float, ptr [[GEP_2_1]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD_2_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAY1:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[LD_2_1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP1]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    store <2 x float> [[TMP6]], ptr [[ARRAY1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x float> [[TMP0]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[ARRAY1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -93,15 +93,15 @@ define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i64, ptr [[ARRAY2:%.*]], i64 1
 ; CHECK-NEXT:    [[LD_2_0:%.*]] = load i64, ptr [[ARRAY2]], align 8
 ; CHECK-NEXT:    [[LD_2_1:%.*]] = load i64, ptr [[GEP_2_1]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP1]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[ARRAY1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -134,15 +134,15 @@ define void @splat_loads_i32(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY2:%.*]], i64 1
 ; CHECK-NEXT:    [[LD_2_0:%.*]] = load i32, ptr [[ARRAY2]], align 8
 ; CHECK-NEXT:    [[LD_2_1:%.*]] = load i32, ptr [[GEP_2_1]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i32> [[TMP1]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr [[ARRAY1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i32> [[TMP0]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    store <2 x i32> [[TMP7]], ptr [[ARRAY1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/ARM/insert-into-small-vector.ll
new file mode 100644
index 00000000000000..803ae70f8aa295
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/insert-into-small-vector.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S --passes=slp-vectorizer -mtriple=thumbv7-unknown-linux-android24 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ogt float [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[V14_0:%.*]] = select i1 [[CMP_I]], float [[TMP1]], float 0.000000e+00
+; CHECK-NEXT:    [[V0_0:%.*]] = select i1 [[CMP_I]], float [[TMP0]], float 0.000000e+00
+; CHECK-NEXT:    [[CMP4_I:%.*]] = fcmp ogt float 0.000000e+00, [[TMP2]]
+; CHECK-NEXT:    [[V19_0:%.*]] = select i1 [[CMP4_I]], float 0.000000e+00, float 0.000000e+00
+; CHECK-NEXT:    [[V9_0:%.*]] = select i1 [[CMP4_I]], float [[TMP2]], float 0.000000e+00
+; CHECK-NEXT:    store float [[V0_0]], ptr null, align 4
+; CHECK-NEXT:    [[V9IDX:%.*]] = getelementptr i8, ptr null, i32 4
+; CHECK-NEXT:    store float [[V9_0]], ptr [[V9IDX]], align 4
+; CHECK-NEXT:    [[V14IDX:%.*]] = getelementptr i8, ptr null, i32 8
+; CHECK-NEXT:    store float [[V14_0]], ptr [[V14IDX]], align 4
+; CHECK-NEXT:    [[V19IDX:%.*]] = getelementptr i8, ptr null, i32 12
+; CHECK-NEXT:    store float [[V19_0]], ptr [[V19IDX]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load float, ptr null, align 4
+  %1 = load float, ptr null, align 4
+  %2 = load float, ptr null, align 4
+  %cmp.i = fcmp ogt float %1, %0
+  %v14.0 = select i1 %cmp.i, float %1, float 0.000000e+00
+  %v0.0 = select i1 %cmp.i, float %0, float 0.000000e+00
+  %cmp4.i = fcmp ogt float 0.000000e+00, %2
+  %v19.0 = select i1 %cmp4.i, float 0.000000e+00, float 0.000000e+00
+  %v9.0 = select i1 %cmp4.i, float %2, float 0.000000e+00
+  store float %v0.0, ptr null, align 4
+  %v9idx = getelementptr i8, ptr null, i32 4
+  store float %v9.0, ptr %v9idx, align 4
+  %v14idx = getelementptr i8, ptr null, i32 8
+  store float %v14.0, ptr %v14idx, align 4
+  %v19idx = getelementptr i8, ptr null, i32 12
+  store float %v19.0, ptr %v19idx, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
index 178a67e35a33f2..d1dd57fac49587 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
@@ -8,10 +8,10 @@ declare i64 @may_throw() willreturn
 ; Base case with no interesting control dependencies
 define void @test_no_control(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test_no_control(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -34,11 +34,11 @@ define void @test1(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -60,11 +60,11 @@ define void @test2(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %c1 = load i64, ptr %c
@@ -87,11 +87,11 @@ define void @test3(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -113,11 +113,11 @@ define void @test4(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -139,11 +139,11 @@ define void @test5(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
 ; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %a2 = getelementptr i64, ptr %a, i32 1
@@ -164,10 +164,10 @@ define void @test5(ptr %a, ptr %b, ptr %c) {
 define void @test6(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -200,11 +200,11 @@ define void @test7(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    store i64 0, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
 ; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -233,11 +233,11 @@ define void @test8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    store i64 0, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -266,11 +266,11 @@ define void @test9(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    store i64 0, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_throw()
 ; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -301,11 +301,11 @@ define void @test10(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    store i64 [[U1]], ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
 ; CHECK-NEXT:    [[U2:%.*]] = udiv i64 200, [[V2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -337,11 +337,11 @@ define void @test11(i64 %x, i64 %y, ptr %b, ptr %c) {
 ; CHECK-NEXT:    store i64 [[U1]], ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
 ; CHECK-NEXT:    [[U2:%.*]] = udiv i64 200, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[B]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %u1 = udiv i64 200, %x
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extracts.ll
new file mode 100644
index 00000000000000..1820407daf0785
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extracts.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-pc-windows-msvc19.34.0 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i1> poison, i1 false, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([17 x i64], ptr null, i64 0, i64 9), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i1> [[TMP0]], i1 false, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr getelementptr inbounds ([17 x i64], ptr null, i64 0, i64 9), align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp.i.i = fcmp olt float 0.000000e+00, 0.000000e+00
+  %0 = zext i1 %cmp.i.i to i64
+  %pgocount88 = load i64, ptr getelementptr inbounds ([17 x i64], ptr null, i64 0, i64 9), align 8
+  %1 = or i64 %pgocount88, %0
+  store i64 %1, ptr getelementptr inbounds ([17 x i64], ptr null, i64 0, i64 9), align 8
+  %cond.i.i = select i1 %cmp.i.i, float 0.000000e+00, float 0.000000e+00
+  %cmp1.i.i = fcmp ogt float %cond.i.i, 0.000000e+00
+  %2 = zext i1 %cmp1.i.i to i64
+  %pgocount89 = load i64, ptr getelementptr inbounds ([17 x i64], ptr null, i64 0, i64 10), align 8
+  %3 = or i64 %pgocount89, %2
+  store i64 %3, ptr getelementptr inbounds ([17 x i64], ptr null, i64 0, i64 10), align 8
+  %cmp.i9.i = fcmp olt float 0.000000e+00, 0.000000e+00
+  %cond.i10.i = select i1 %cmp.i9.i, float 0.000000e+00, float 0.000000e+00
+  %cmp1.i11.i = fcmp ogt float %cond.i10.i, 0.000000e+00
+  %cmp.i14.i = fcmp olt float 0.000000e+00, 0.000000e+00
+  %cond.i15.i = select i1 %cmp.i14.i, float 0.000000e+00, float 0.000000e+00
+  %cmp1.i16.i = fcmp ogt float %cond.i15.i, 0.000000e+00
+  %cmp.i19.i = fcmp olt float 0.000000e+00, 0.000000e+00
+  %cond.i20.i = select i1 %cmp.i19.i, float 0.000000e+00, float 0.000000e+00
+  %cmp1.i21.i = fcmp ogt float %cond.i20.i, 0.000000e+00
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
index c5981c44d96dd9..eb3d395f4c6a6f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -241,21 +241,10 @@ define void @tiny_vector_gather(ptr %a, ptr %v1, ptr %v2) {
 ; CHECK-LABEL: @tiny_vector_gather(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V1:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[V2:%.*]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[A:%.*]], align 16
-; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 1
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[PTR1]], align 4
-; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[PTR2]], align 8
-; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[PTR3]], align 4
-; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[PTR4]], align 16
-; CHECK-NEXT:    [[PTR5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 5
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[PTR5]], align 4
-; CHECK-NEXT:    [[PTR6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 6
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[PTR6]], align 8
-; CHECK-NEXT:    [[PTR7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 7
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[PTR7]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr [[A:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %1 = load i32, ptr %v1, align 4
diff --git a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll
index 97cbc2e27067bd..7fbd4506bea78c 100644
--- a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll
+++ b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
 ; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
 
@@ -9,32 +10,36 @@
 
 ; setjmp/longjmp test with dynamically sized array.
 ; Requires protector.
-; CHECK: @foo(i32 %[[ARG:.*]])
 define i32 @foo(i32 %size) nounwind uwtable safestack {
+; CHECK-LABEL: define i32 @foo(
+; CHECK-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8
+; CHECK-NEXT:    [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[SIZE]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], -16
+; CHECK-NEXT:    [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8
+; CHECK-NEXT:    store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
+; CHECK-NEXT:    store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8
+; CHECK-NEXT:    call void @funcall(ptr [[A]])
+; CHECK-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8
+; CHECK-NEXT:    ret i32 0
+;
 entry:
-  ; CHECK: %[[SP:.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr
-  ; CHECK-NEXT: %[[DYNPTR:.*]] = alloca ptr
-  ; CHECK-NEXT: store ptr %[[SP]], ptr %[[DYNPTR]]
 
-  ; CHECK-NEXT: %[[ZEXT:.*]] = zext i32 %[[ARG]] to i64
-  ; CHECK-NEXT: %[[MUL:.*]] = mul i64 %[[ZEXT]], 4
-  ; CHECK-NEXT: %[[SP2:.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr
-  ; CHECK-NEXT: %[[PTRTOINT:.*]] = ptrtoint ptr %[[SP2]] to i64
-  ; CHECK-NEXT: %[[SUB:.*]] = sub i64 %[[PTRTOINT]], %[[MUL]]
-  ; CHECK-NEXT: %[[AND:.*]] = and i64 %[[SUB]], -16
-  ; CHECK-NEXT: %[[INTTOPTR:.*]] = inttoptr i64 %[[AND]] to ptr
-  ; CHECK-NEXT: store ptr %[[INTTOPTR]], ptr @__safestack_unsafe_stack_ptr
-  ; CHECK-NEXT: store ptr %[[INTTOPTR]], ptr %unsafe_stack_dynamic_ptr
   %a = alloca i32, i32 %size
 
-  ; CHECK: setjmp
-  ; CHECK-NEXT: %[[LOAD:.*]] = load ptr, ptr %[[DYNPTR]]
-  ; CHECK-NEXT: store ptr %[[LOAD]], ptr @__safestack_unsafe_stack_ptr
   %call = call i32 @_setjmp(ptr @buf) returns_twice
 
-  ; CHECK: call void @funcall(ptr %[[INTTOPTR]])
   call void @funcall(ptr %a)
-  ; CHECK-NEXT: store ptr %[[SP:.*]], ptr @__safestack_unsafe_stack_ptr
   ret i32 0
 }
 
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index 5a76104c7a65f2..5ae4ea9dad7de0 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt < %s -mtriple=nvptx64-nvidia-cuda -passes=separate-const-offset-from-gep \
 ; RUN:       -reassociate-geps-verify-no-dead-code -S | FileCheck %s
 
@@ -13,19 +14,35 @@
 ; We should not extract any struct field indices, because fields in a struct
 ; may have different types.
 define ptr @struct(i32 %i) {
+; CHECK-LABEL: define ptr @struct(
+; CHECK-SAME: i32 [[I:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [1024 x %struct.S], ptr @struct_array, i64 0, i64 [[TMP0]], i32 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 10
+; CHECK-NEXT:    ret ptr [[P2]]
+;
 entry:
   %add = add nsw i32 %i, 5
   %idxprom = sext i32 %add to i64
   %p = getelementptr inbounds [1024 x %struct.S], ptr @struct_array, i64 0, i64 %idxprom, i32 1
   ret ptr %p
 }
-; CHECK-LABEL: @struct(
-; CHECK: getelementptr [1024 x %struct.S], ptr @struct_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1
 
 ; We should be able to trace into sext(a + b) if a + b is non-negative
 ; (e.g., used as an index of an inbounds GEP) and one of a and b is
 ; non-negative.
 define ptr @sext_add(i32 %i, i32 %j) {
+; CHECK-LABEL: define ptr @sext_add(
+; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[J]], -2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[TMP2]], i64 [[TMP1]]
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 32
+; CHECK-NEXT:    ret ptr [[P1]]
+;
 entry:
   %0 = add i32 %i, 1
   %1 = sext i32 %0 to i64  ; inbound sext(i + 1) = sext(i) + 1
@@ -35,12 +52,6 @@ entry:
   %p = getelementptr inbounds [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %1, i64 %3
   ret ptr %p
 }
-; CHECK-LABEL: @sext_add(
-; CHECK-NOT: = add
-; CHECK: add i32 %j, -2
-; CHECK: sext
-; CHECK: getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr inbounds float, ptr %{{[a-zA-Z0-9]+}}, i64 32
 
 ; We should be able to trace into sext/zext if it can be distributed to both
 ; operands, e.g., sext (add nsw a, b) == add nsw (sext a), (sext b)
@@ -50,6 +61,16 @@ entry:
 ; to
 ;   gep base, a + sext(b), c + zext(d); gep ..., 1 * 32 + 1
 define ptr @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) {
+; CHECK-LABEL: define ptr @ext_add_no_overflow(
+; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[B]] to i64
+; CHECK-NEXT:    [[I2:%.*]] = add i64 [[A]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT:    [[J4:%.*]] = add i64 [[C]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 [[J4]]
+; CHECK-NEXT:    [[P5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 33
+; CHECK-NEXT:    ret ptr [[P5]]
+;
   %b1 = add nsw i32 %b, 1
   %b2 = sext i32 %b1 to i64
   %i = add i64 %a, %b2       ; i = a + sext(b +nsw 1)
@@ -59,12 +80,30 @@ define ptr @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) {
   %p = getelementptr inbounds [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %i, i64 %j
   ret ptr %p
 }
-; CHECK-LABEL: @ext_add_no_overflow(
-; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr inbounds float, ptr [[BASE_PTR]], i64 33
 
 ; Verifies we handle nested sext/zext correctly.
 define void @sext_zext(i32 %a, i32 %b, ptr %out1, ptr %out2) {
+; CHECK-LABEL: define void @sext_zext(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], ptr [[OUT1:%.*]], ptr [[OUT2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i32 [[B]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i48
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i48 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[A]] to i48
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[TMP4]], i64 [[TMP2]]
+; CHECK-NEXT:    [[P11:%.*]] = getelementptr float, ptr [[TMP5]], i64 32
+; CHECK-NEXT:    store ptr [[P11]], ptr [[OUT1]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i32 [[B]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i48
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i48 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[A]] to i48
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i48 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[TMP10]], i64 [[TMP8]]
+; CHECK-NEXT:    [[P22:%.*]] = getelementptr float, ptr [[TMP11]], i64 96
+; CHECK-NEXT:    store ptr [[P22]], ptr [[OUT2]], align 8
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = add nsw nuw i32 %a, 1
   %1 = sext i32 %0 to i48
@@ -84,15 +123,23 @@ entry:
   store ptr %p2, ptr %out2
   ret void
 }
-; CHECK-LABEL: @sext_zext(
-; CHECK: [[BASE_PTR_1:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr float, ptr [[BASE_PTR_1]], i64 32
-; CHECK: [[BASE_PTR_2:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr float, ptr [[BASE_PTR_2]], i64 96
 
 ; Similar to @ext_add_no_overflow, we should be able to trace into s/zext if
 ; its operand is an OR and the two operands of the OR have no common bits.
 define ptr @sext_or(i64 %a, i32 %b) {
+; CHECK-LABEL: define ptr @sext_or(
+; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B1:%.*]] = shl i32 [[B]], 2
+; CHECK-NEXT:    [[B3:%.*]] = or i32 [[B1]], 4
+; CHECK-NEXT:    [[B3_EXT:%.*]] = sext i32 [[B3]] to i64
+; CHECK-NEXT:    [[J:%.*]] = add i64 [[A]], [[B3_EXT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[B1]] to i64
+; CHECK-NEXT:    [[I2:%.*]] = add i64 [[A]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 [[J]]
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 32
+; CHECK-NEXT:    ret ptr [[P3]]
+;
 entry:
   %b1 = shl i32 %b, 2
   %b2 = or i32 %b1, 1 ; (b << 2) and 1 have no common bits
@@ -104,14 +151,21 @@ entry:
   %p = getelementptr inbounds [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %i, i64 %j
   ret ptr %p
 }
-; CHECK-LABEL: @sext_or(
-; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr inbounds float, ptr [[BASE_PTR]], i64 32
 
 ; The subexpression (b + 5) is used in both "i = a + (b + 5)" and "*out = b +
 ; 5". When extracting the constant offset 5, make sure "*out = b + 5" isn't
 ; affected.
 define ptr @expr(i64 %a, i64 %b, ptr %out) {
+; CHECK-LABEL: define ptr @expr(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B5:%.*]] = add i64 [[B]], 5
+; CHECK-NEXT:    [[I2:%.*]] = add i64 [[B]], [[A]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 0
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 160
+; CHECK-NEXT:    store i64 [[B5]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret ptr [[P3]]
+;
 entry:
   %b5 = add i64 %b, 5
   %i = add i64 %b5, %a
@@ -119,13 +173,22 @@ entry:
   store i64 %b5, ptr %out
   ret ptr %p
 }
-; CHECK-LABEL: @expr(
-; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 0
-; CHECK: getelementptr inbounds float, ptr [[BASE_PTR]], i64 160
-; CHECK: store i64 %b5, ptr %out
 
 ; d + sext(a +nsw (b +nsw (c +nsw 8))) => (d + sext(a) + sext(b) + sext(c)) + 8
 define ptr @sext_expr(i32 %a, i32 %b, i32 %c, i64 %d) {
+; CHECK-LABEL: define ptr @sext_expr(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i64 [[D:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[B]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[C]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[I1:%.*]] = add i64 [[D]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 [[I1]]
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 8
+; CHECK-NEXT:    ret ptr [[P2]]
+;
 entry:
   %0 = add nsw i32 %c, 8
   %1 = add nsw i32 %b, %0
@@ -135,28 +198,36 @@ entry:
   %p = getelementptr inbounds [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %i
   ret ptr %p
 }
-; CHECK-LABEL: @sext_expr(
-; CHECK: sext i32
-; CHECK: sext i32
-; CHECK: sext i32
-; CHECK: getelementptr inbounds float, ptr %{{[a-zA-Z0-9]+}}, i64 8
 
 ; Verifies we handle "sub" correctly.
 define ptr @sub(i64 %i, i64 %j) {
+; CHECK-LABEL: define ptr @sub(
+; CHECK-SAME: i64 [[I:%.*]], i64 [[J:%.*]]) {
+; CHECK-NEXT:    [[J22:%.*]] = sub i64 0, [[J]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I]], i64 [[J22]]
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 -155
+; CHECK-NEXT:    ret ptr [[P3]]
+;
   %i2 = sub i64 %i, 5 ; i - 5
   %j2 = sub i64 5, %j ; 5 - i
   %p = getelementptr inbounds [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %i2, i64 %j2
   ret ptr %p
 }
-; CHECK-LABEL: @sub(
-; CHECK: %[[j2:[a-zA-Z0-9]+]] = sub i64 0, %j
-; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 %i, i64 %[[j2]]
-; CHECK: getelementptr inbounds float, ptr [[BASE_PTR]], i64 -155
 
 %struct.Packed = type <{ [3 x i32], [8 x i64] }> ; <> means packed
 
 ; Verifies we can emit correct uglygep if the address is not natually aligned.
 define ptr @packed_struct(i32 %i, i32 %j) {
+; CHECK-LABEL: define ptr @packed_struct(
+; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S:%.*]] = alloca [1024 x %struct.Packed], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[J]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [1024 x %struct.Packed], ptr [[S]], i64 0, i64 [[TMP0]], i32 1, i64 [[TMP1]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 100
+; CHECK-NEXT:    ret ptr [[UGLYGEP]]
+;
 entry:
   %s = alloca [1024 x %struct.Packed], align 16
   %add = add nsw i32 %j, 3
@@ -166,14 +237,19 @@ entry:
   %arrayidx3 = getelementptr inbounds [1024 x %struct.Packed], ptr %s, i64 0, i64 %idxprom2, i32 1, i64 %idxprom
   ret ptr %arrayidx3
 }
-; CHECK-LABEL: @packed_struct(
-; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [1024 x %struct.Packed], ptr %s, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: %uglygep = getelementptr inbounds i8, ptr [[BASE_PTR]], i64 100
-; CHECK-NEXT: ret ptr %uglygep
 
 ; We shouldn't be able to extract the 8 from "zext(a +nuw (b + 8))",
 ; because "zext(b + 8) != zext(b) + 8"
 define ptr @zext_expr(i32 %a, i32 %b) {
+; CHECK-LABEL: define ptr @zext_expr(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[B]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[A]], [[TMP0]]
+; CHECK-NEXT:    [[I:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 [[I]]
+; CHECK-NEXT:    ret ptr [[P]]
+;
 entry:
   %0 = add i32 %b, 8
   %1 = add nuw i32 %a, %0
@@ -181,8 +257,6 @@ entry:
   %p = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %i
   ret ptr %p
 }
-; CHECK-LABEL: zext_expr(
-; CHECK: getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %i
 
 ; Per http://llvm.org/docs/LangRef.html#id181, the indices of a off-bound gep
 ; should be considered sign-extended to the pointer size. Therefore,
@@ -193,20 +267,34 @@ entry:
 ; This test verifies we do not illegitimately extract the 8 from
 ;   gep base, (i32 a + 8)
 define ptr @i32_add(i32 %a) {
+; CHECK-LABEL: define ptr @i32_add(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = add i32 [[A]], 8
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    ret ptr [[P]]
+;
 entry:
   %i = add i32 %a, 8
   %p = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i32 %i
   ret ptr %p
 }
-; CHECK-LABEL: @i32_add(
-; CHECK: getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}}
-; CHECK-NOT: getelementptr
 
 ; Verifies that we compute the correct constant offset when the index is
 ; sign-extended and then zero-extended. The old version of our code failed to
 ; handle this case because it simply computed the constant offset as the
 ; sign-extended value of the constant part of the GEP index.
 define ptr @apint(i1 %a) {
+; CHECK-LABEL: define ptr @apint(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[A]] to i4
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i4 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr float, ptr [[TMP2]], i64 15
+; CHECK-NEXT:    ret ptr [[P1]]
+;
 entry:
   %0 = add nsw nuw i1 %a, 1
   %1 = sext i1 %0 to i4
@@ -214,39 +302,45 @@ entry:
   %p = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %2
   ret ptr %p
 }
-; CHECK-LABEL: @apint(
-; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr float, ptr [[BASE_PTR]], i64 15
 
 ; Do not trace into binary operators other than ADD, SUB, and OR.
 define ptr @and(i64 %a) {
+; CHECK-LABEL: define ptr @and(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[A]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    ret ptr [[P]]
+;
 entry:
   %0 = shl i64 %a, 2
   %1 = and i64 %0, 1
   %p = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 %1
   ret ptr %p
 }
-; CHECK-LABEL: @and(
-; CHECK: getelementptr [32 x [32 x float]], ptr @float_2d_array
-; CHECK-NOT: getelementptr
 
 ; The code that rebuilds an OR expression used to be buggy, and failed on this
 ; test.
 define ptr @shl_add_or(i64 %a, ptr %ptr) {
-; CHECK-LABEL: @shl_add_or(
+; CHECK-LABEL: define ptr @shl_add_or(
+; CHECK-SAME: i64 [[A:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[A]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = add i64 [[SHL]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr float, ptr [[PTR]], i64 [[OR2]]
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr float, ptr [[TMP0]], i64 12
+; CHECK-NEXT:    ret ptr [[P3]]
+;
 entry:
   %shl = shl i64 %a, 2
   %add = add i64 %shl, 12
   %or = or i64 %add, 1
-; CHECK: [[OR:%or[0-9]*]] = add i64 %shl, 1
   ; ((a << 2) + 12) and 1 have no common bits. Therefore,
   ; SeparateConstOffsetFromGEP is able to extract the 12.
   ; TODO(jingyue): We could reassociate the expression to combine 12 and 1.
   %p = getelementptr float, ptr %ptr, i64 %or
-; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr float, ptr %ptr, i64 [[OR]]
-; CHECK: getelementptr float, ptr [[PTR]], i64 12
   ret ptr %p
-; CHECK-NEXT: ret
 }
 
 ; The source code used to be buggy in checking
@@ -260,40 +354,46 @@ entry:
 %struct1 = type { i64, %struct2 }
 %struct0 = type { i32, i32, ptr, [100 x %struct1] }
 define ptr @sign_mod_unsign(ptr %ptr, i64 %idx) {
-; CHECK-LABEL: @sign_mod_unsign(
+; CHECK-LABEL: define ptr @sign_mod_unsign(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1
+; CHECK-NEXT:    [[PTR22:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 -3
+; CHECK-NEXT:    ret ptr [[PTR22]]
+;
 entry:
   %arrayidx = add nsw i64 %idx, -2
-; CHECK-NOT: add
   %ptr2 = getelementptr inbounds %struct0, ptr %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
-; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, ptr %ptr, i64 0, i32 3, i64 %idx, i32 1
-; CHECK: getelementptr inbounds %struct2, ptr [[PTR]], i64 -3
   ret ptr %ptr2
-; CHECK-NEXT: ret
 }
 
 ; Check that we can see through explicit trunc() instruction.
 define ptr @trunk_explicit(ptr %ptr, i64 %idx) {
-; CHECK-LABEL: @trunk_explicit(
+; CHECK-LABEL: define ptr @trunk_explicit(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1
+; CHECK-NEXT:    [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151
+; CHECK-NEXT:    ret ptr [[PTR21]]
+;
 entry:
   %idx0 = trunc i64 1 to i32
   %ptr2 = getelementptr inbounds %struct0, ptr %ptr, i32 %idx0, i32 3, i64 %idx, i32 1
-; CHECK-NOT: trunc
-; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, ptr %ptr, i64 0, i32 3, i64 %idx, i32 1
-; CHECK: getelementptr inbounds %struct2, ptr %0, i64 151
   ret ptr %ptr2
-; CHECK-NEXT: ret
 }
 
 ; Check that we can deal with trunc inserted by
 ; canonicalizeArrayIndicesToPointerSize() if size of an index is larger than
 ; that of the pointer.
 define ptr @trunk_long_idx(ptr %ptr, i64 %idx) {
-; CHECK-LABEL: @trunk_long_idx(
+; CHECK-LABEL: define ptr @trunk_long_idx(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1
+; CHECK-NEXT:    [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151
+; CHECK-NEXT:    ret ptr [[PTR21]]
+;
 entry:
   %ptr2 = getelementptr inbounds %struct0, ptr %ptr, i65 1, i32 3, i64 %idx, i32 1
-; CHECK-NOT: trunc
-; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, ptr %ptr, i64 0, i32 3, i64 %idx, i32 1
-; CHECK: getelementptr inbounds %struct2, ptr %0, i64 151
   ret ptr %ptr2
-; CHECK-NEXT: ret
 }
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index a2a08571184593..a40798665a50d2 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -86,14 +86,14 @@ attributes #0 = { nounwind readnone }
 
 ; SLEEFGNUABI:      attributes #[[SIN]] = { "vector-function-abi-variant"=
 ; SLEEFGNUABI-SAME:   "_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin),
-; SLEEFGNUABI-SAME:   _ZGV_LLVM_Mxv_sin(_ZGVsMxv_sin)" }
+; SLEEFGNUABI-SAME:   _ZGVsMxv_sin(_ZGVsMxv_sin)" }
 ; SLEEFGNUABI:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
 ; SLEEFGNUABI-SAME:   "_ZGV_LLVM_N4v_llvm.log10.f32(_ZGVnN4v_log10f),
-; SLEEFGNUABI-SAME:   _ZGV_LLVM_Mxv_llvm.log10.f32(_ZGVsMxv_log10f)" }
+; SLEEFGNUABI-SAME:   _ZGVsMxv_llvm.log10.f32(_ZGVsMxv_log10f)" }
 
 ; ARMPL:      attributes #[[SIN]] = { "vector-function-abi-variant"=
 ; ARMPL-SAME:    "_ZGV_LLVM_N2v_sin(armpl_vsinq_f64),
-; ARMPL-SAME     _ZGV_LLVM_Mxv_sin(armpl_svsin_f64_x)" }
+; ARMPL-SAME     _ZGVsMxv_sin(armpl_svsin_f64_x)" }
 ; ARMPL:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
 ; ARMPL-SAME:    "_ZGV_LLVM_N4v_llvm.log10.f32(armpl_vlog10q_f32),
-; ARMPL-SAME     _ZGV_LLVM_Mxv_llvm.log10.f32(armpl_svlog10_f32_x)" }
+; ARMPL-SAME     _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" }
diff --git a/llvm/test/tools/llvm-debuginfod-find/headers.test b/llvm/test/tools/llvm-debuginfod-find/headers.test
index c838d9fe7dc62d..86517f8fc5a121 100644
--- a/llvm/test/tools/llvm-debuginfod-find/headers.test
+++ b/llvm/test/tools/llvm-debuginfod-find/headers.test
@@ -16,9 +16,11 @@ RUN: env DEBUGINFOD_CACHE=%t/debuginfod-cache DEBUGINFOD_HEADERS_FILE=%S/Inputs/
 RUN:   | FileCheck --check-prefix ERR -DHEADER_FILE=%S/Inputs/headers %s
 
 NO-HEADERS: Accept: */*
+NO-HEADERS-NEXT: Accept-Encoding: {{.*}}
 NO-HEADERS-NOT: {{.}}
 
 HEADERS:      Accept: */*
+HEADERS-NEXT: Accept-Encoding: {{.*}}
 HEADERS-NEXT: A: B
 HEADERS-NEXT: C: D
 HEADERS-NEXT: E: F
diff --git a/llvm/test/tools/llvm-lipo/create-archive-input.test b/llvm/test/tools/llvm-lipo/create-archive-input.test
index 5558797db87b69..c4323811af6fd8 100644
--- a/llvm/test/tools/llvm-lipo/create-archive-input.test
+++ b/llvm/test/tools/llvm-lipo/create-archive-input.test
@@ -28,17 +28,27 @@
 # RUN: llvm-lipo %t-ir-armv7-x86_64-universal.o -thin x86_64 -output %t-ir-extracted-x86_64.o
 # RUN: cmp %t-ir-extracted-x86_64.o %t-ir-x86_64.o
 
-# RUN: llvm-ar cr %t.different_types0.a %t-i386.o %t-ir-x86_64.o
-# RUN: not llvm-lipo -create %t.different_types0.a -output /dev/null 2>&1 | FileCheck --check-prefix=ARCHIVE-WITH-MACHO-AND-IR %s
-# RUN: llvm-ar cr %t.different_types1.a %t-ir-x86_64.o %t-i386.o 
-# RUN: not llvm-lipo -create %t.different_types1.a -output /dev/null 2>&1 | FileCheck --check-prefix=ARCHIVE-WITH-IR-AND-MACHO %s
+# RUN: llvm-ar cr %t.different_types0-i386.a %t-i386.o
+# RUN: llvm-ar cr %t.different_types0-x86_64.a %t-x86_64.o %t-ir-x86_64.o
+# RUN: llvm-lipo -create %t.different_types0-i386.a %t.different_types0-x86_64.a -output %t.different_types0-universal.a
+# RUN: llvm-lipo %t.different_types0-universal.a -thin i386 -output %t.different_types0-extracted-i386.a
+# RUN: llvm-lipo %t.different_types0-universal.a -thin x86_64 -output %t.different_types0-extracted-x86_64.a
+# RUN: cmp %t.different_types0-extracted-i386.a %t.different_types0-i386.a
+# RUN: cmp %t.different_types0-extracted-x86_64.a %t.different_types0-x86_64.a
+
+# RUN: llvm-ar cr %t.different_types1-i386.a %t-i386.o
+# RUN: llvm-ar cr %t.different_types1-x86_64.a %t-ir-x86_64.o %t-x86_64.o
+# RUN: llvm-lipo -create %t.different_types1-x86_64.a %t.different_types1-i386.a -output %t.different_types1-universal.a
+# RUN: llvm-lipo %t.different_types1-universal.a -thin i386 -output %t.different_types1-extracted-i386.a
+# RUN: llvm-lipo %t.different_types1-universal.a -thin x86_64 -output %t.different_types1-extracted-x86_64.a
+# RUN: cmp %t.different_types1-extracted-i386.a %t.different_types1-i386.a
+# RUN: cmp %t.different_types1-extracted-x86_64.a %t.different_types1-x86_64.a
+
 # RUN: llvm-ar cr %t.different_architectures_ir.a %t-ir-x86_64.o %t-ir-armv7.o
 # RUN: not llvm-lipo -create %t.different_architectures_ir.a -output /dev/null 2>&1 | FileCheck --check-prefix=ARCHIVE-WITH-DIFFERENT-ARCHS %s
 
 # EMPTY-ARCHIVE: empty archive
 # ARCHIVE-WITH-DIFFERENT-ARCHS: all members must match
-# ARCHIVE-WITH-MACHO-AND-IR: is an LLVM IR object, while previous archive member {{.*}} was a MachO
-# ARCHIVE-WITH-IR-AND-MACHO: is a MachO, while previous archive member {{.*}} was an IR LLVM object
 # ARCHIVE-WITH-FAT-BINARY: fat file (not allowed in an archive)
 #
 # INFO-i386-x86_64: i386 x86_64
diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
index 24f3f563e9689d..c0072bcbde1b38 100644
--- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
+++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
@@ -1,7 +1,7 @@
 // Magic
 RUN: printf '\377lprofr\201' > %t
 // Version
-RUN: printf '\0\01\0\0\0\0\0\10' >> %t
+RUN: printf '\0\0\0\0\10\0\0\10' >> %t
 // The rest of the header needs to be there to prevent a broken header error.
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
@@ -15,5 +15,5 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 
 RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s
 
-CHECK: raw profile version mismatch: Profile uses raw profile format version = 281474976710664; expected version = {{[0-9]+}}
+CHECK: raw profile version mismatch: Profile uses raw profile format version = 134217736; expected version = {{[0-9]+}}
 CHECK-NEXT: PLEASE update this tool to version in the raw profile, or regenerate raw profile with expected version.
diff --git a/llvm/test/tools/llvm-rc/Inputs/menuex.rc b/llvm/test/tools/llvm-rc/Inputs/menuex.rc
new file mode 100644
index 00000000000000..766fffe03b87d9
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/menuex.rc
@@ -0,0 +1,14 @@
+101 MENUEX
+BEGIN
+    POPUP "File",                                           40182
+    BEGIN
+        MENUITEM "Load",                                    40011
+        MENUITEM "", 0, 0x00000800L
+
+        POPUP "Savestate Slot",                             40187
+        BEGIN
+            MENUITEM "&1",                                  40099
+            MENUITEM "&2",                                  40100
+        END
+    END
+END
diff --git a/llvm/test/tools/llvm-rc/menuex.test b/llvm/test/tools/llvm-rc/menuex.test
new file mode 100644
index 00000000000000..2a51ffd110419c
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/menuex.test
@@ -0,0 +1,25 @@
+; RUN: llvm-rc -no-preprocess /FO %t -- %p/Inputs/menuex.rc
+; RUN: llvm-readobj %t | FileCheck %s
+
+CHECK:      Resource type (int): MENU (ID 4)
+CHECK-NEXT: Resource name (int): 101
+CHECK-NEXT: Data version: 0
+CHECK-NEXT: Memory flags: 0x1030
+CHECK-NEXT: Language ID: 1033
+CHECK-NEXT: Version (major): 0
+CHECK-NEXT: Version (minor): 0
+CHECK-NEXT: Characteristics: 0
+CHECK-NEXT: Data size: 164
+CHECK-NEXT: Data: (
+CHECK-NEXT:   0000: 01000400 00000000 00000000 00000000  |................|
+CHECK-NEXT:   0010: F69C0000 81004600 69006C00 65000000  |......F.i.l.e...|
+CHECK-NEXT:   0020: 00000000 00000000 00000000 4B9C0000  |............K...|
+CHECK-NEXT:   0030: 00004C00 6F006100 64000000 00080000  |..L.o.a.d.......|
+CHECK-NEXT:   0040: 00000000 00000000 00000000 00000000  |................|
+CHECK-NEXT:   0050: 00000000 FB9C0000 81005300 61007600  |..........S.a.v.|
+CHECK-NEXT:   0060: 65007300 74006100 74006500 20005300  |e.s.t.a.t.e. .S.|
+CHECK-NEXT:   0070: 6C006F00 74000000 00000000 00000000  |l.o.t...........|
+CHECK-NEXT:   0080: 00000000 A39C0000 00002600 31000000  |..........&.1...|
+CHECK-NEXT:   0090: 00000000 00000000 A49C0000 80002600  |..............&.|
+CHECK-NEXT:   00A0: 32000000                             |2...|
+CHECK-NEXT: )
diff --git a/llvm/tools/llvm-isel-fuzzer/llvm-isel-fuzzer.cpp b/llvm/tools/llvm-isel-fuzzer/llvm-isel-fuzzer.cpp
index 9f47609394a5d3..742f7b94e116f3 100644
--- a/llvm/tools/llvm-isel-fuzzer/llvm-isel-fuzzer.cpp
+++ b/llvm/tools/llvm-isel-fuzzer/llvm-isel-fuzzer.cpp
@@ -129,20 +129,7 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc,
     exit(1);
   }
 
-  Triple TheTriple = Triple(Triple::normalize(TargetTriple));
-
-  // Get the target specific parser.
-  std::string Error;
-  const Target *TheTarget =
-      TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error);
-  if (!TheTarget) {
-    errs() << argv[0] << ": " << Error;
-    return 1;
-  }
-
   // Set up the pipeline like llc does.
-  std::string CPUStr = codegen::getCPUStr(),
-              FeaturesStr = codegen::getFeaturesStr();
 
   CodeGenOptLevel OLvl;
   if (auto Level = CodeGenOpt::parseLevel(OptLevel)) {
@@ -151,11 +138,9 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc,
     errs() << argv[0] << ": invalid optimization level.\n";
     return 1;
   }
-
-  TargetOptions Options = codegen::InitTargetOptionsFromCodeGenFlags(TheTriple);
-  TM.reset(TheTarget->createTargetMachine(
-      TheTriple.getTriple(), CPUStr, FeaturesStr, Options,
-      codegen::getExplicitRelocModel(), codegen::getExplicitCodeModel(), OLvl));
+  ExitOnError ExitOnErr(std::string(*argv[0]) + ": error:");
+  TM = ExitOnErr(codegen::createTargetMachineForTriple(
+      Triple::normalize(TargetTriple), OLvl));
   assert(TM && "Could not allocate target machine!");
 
   // Make sure we print the summary and the current unit when LLVM errors out.
diff --git a/llvm/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/llvm/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
index 0b2cb9598e5bf6..fcccf0e07ef8d2 100644
--- a/llvm/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
+++ b/llvm/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
@@ -198,23 +198,9 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc,
     errs() << *argv[0] << ": -mtriple must be specified\n";
     exit(1);
   }
-  Triple TargetTriple = Triple(Triple::normalize(TargetTripleStr));
-
-  std::string Error;
-  const Target *TheTarget =
-      TargetRegistry::lookupTarget(codegen::getMArch(), TargetTriple, Error);
-  if (!TheTarget) {
-    errs() << *argv[0] << ": " << Error;
-    exit(1);
-  }
-
-  TargetOptions Options =
-      codegen::InitTargetOptionsFromCodeGenFlags(TargetTriple);
-  TM.reset(TheTarget->createTargetMachine(
-      TargetTriple.getTriple(), codegen::getCPUStr(), codegen::getFeaturesStr(),
-      Options, codegen::getExplicitRelocModel(),
-      codegen::getExplicitCodeModel(), CodeGenOptLevel::Default));
-  assert(TM && "Could not allocate target machine!");
+  ExitOnError ExitOnErr(std::string(*argv[0]) + ": error:");
+  TM = ExitOnErr(codegen::createTargetMachineForTriple(
+      Triple::normalize(TargetTripleStr)));
 
   // Check that pass pipeline is specified and correct
   //
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index fdb2c1405f1237..18307935c7558b 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -2390,7 +2390,7 @@ static void traverseAllValueSites(const InstrProfRecord &Func, uint32_t VK,
       if (Symtab == nullptr)
         OS << format("%4" PRIu64, VD[V].Value);
       else
-        OS << Symtab->getFuncName(VD[V].Value);
+        OS << Symtab->getFuncOrVarName(VD[V].Value);
       OS << ", " << format("%10" PRId64, VD[V].Count) << " ] ("
          << format("%.2f%%", (VD[V].Count * 100.0 / SiteSum)) << ")\n";
     }
@@ -2644,7 +2644,7 @@ static int showInstrProfile(
       OS << "  Temporal Profile Trace " << i << " (weight=" << Traces[i].Weight
          << " count=" << Traces[i].FunctionNameRefs.size() << "):\n";
       for (auto &NameRef : Traces[i].FunctionNameRefs)
-        OS << "    " << Reader->getSymtab().getFuncName(NameRef) << "\n";
+        OS << "    " << Reader->getSymtab().getFuncOrVarName(NameRef) << "\n";
     }
   }
 
@@ -3079,7 +3079,7 @@ static int order_main(int argc, const char *argv[]) {
   WithColor::note() << "# Ordered " << Nodes.size() << " functions\n";
   for (auto &N : Nodes) {
     auto [Filename, ParsedFuncName] =
-        getParsedIRPGOFuncName(Reader->getSymtab().getFuncName(N.Id));
+        getParsedIRPGOFuncName(Reader->getSymtab().getFuncOrVarName(N.Id));
     if (!Filename.empty())
       OS << "# " << Filename << "\n";
     OS << ParsedFuncName << "\n";
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index f26b2bd5df5eda..85d7c1123fecaf 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -810,24 +810,31 @@ void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) {
     loadSymbolsFromDWARFUnit(*CompilationUnit.get());
 
   // Handles DWO sections that can either be in .o, .dwo or .dwp files.
+  uint32_t NumOfDWOMissing = 0;
   for (const auto &CompilationUnit : DebugContext->compile_units()) {
     DWARFUnit *const DwarfUnit = CompilationUnit.get();
     if (DwarfUnit->getDWOId()) {
       DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit();
       if (!DWOCU->isDWOUnit()) {
-        std::string DWOName = dwarf::toString(
-            DwarfUnit->getUnitDIE().find(
-                {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}),
-            "");
-        WithColor::warning()
-            << "DWO debug information for " << DWOName
-            << " was not loaded. Please check the .o, .dwo or .dwp path.\n";
+        NumOfDWOMissing++;
+        if (ShowDetailedWarning) {
+          std::string DWOName = dwarf::toString(
+              DwarfUnit->getUnitDIE().find(
+                  {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}),
+              "");
+          WithColor::warning() << "DWO debug information for " << DWOName
+                               << " was not loaded.\n";
+        }
         continue;
       }
       loadSymbolsFromDWARFUnit(*DWOCU);
     }
   }
 
+  if (NumOfDWOMissing)
+    WithColor::warning()
+        << " DWO debug information was not loaded for " << NumOfDWOMissing
+        << " modules. Please check the .o, .dwo or .dwp path.\n";
   if (BinaryFunctions.empty())
     WithColor::warning() << "Loading of DWARF info completed, but no binary "
                             "functions have been retrieved.\n";
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index 62eed50976d90a..dd9db338ece255 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -471,6 +471,10 @@ Error ResourceFileWriter::visitMenuResource(const RCResource *Res) {
   return writeResource(Res, &ResourceFileWriter::writeMenuBody);
 }
 
+Error ResourceFileWriter::visitMenuExResource(const RCResource *Res) {
+  return writeResource(Res, &ResourceFileWriter::writeMenuExBody);
+}
+
 Error ResourceFileWriter::visitStringTableResource(const RCResource *Base) {
   const auto *Res = cast<StringTableResource>(Base);
 
@@ -1176,6 +1180,7 @@ Error ResourceFileWriter::writeHTMLBody(const RCResource *Base) {
 
 Error ResourceFileWriter::writeMenuDefinition(
     const std::unique_ptr<MenuDefinition> &Def, uint16_t Flags) {
+  // https://learn.microsoft.com/en-us/windows/win32/api/winuser/ns-winuser-menuitemtemplate
   assert(Def);
   const MenuDefinition *DefPtr = Def.get();
 
@@ -1202,6 +1207,34 @@ Error ResourceFileWriter::writeMenuDefinition(
   return writeMenuDefinitionList(PopupPtr->SubItems);
 }
 
+Error ResourceFileWriter::writeMenuExDefinition(
+    const std::unique_ptr<MenuDefinition> &Def, uint16_t Flags) {
+  // https://learn.microsoft.com/en-us/windows/win32/menurc/menuex-template-item
+  assert(Def);
+  const MenuDefinition *DefPtr = Def.get();
+
+  padStream(sizeof(uint32_t));
+  if (auto *MenuItemPtr = dyn_cast<MenuExItem>(DefPtr)) {
+    writeInt<uint32_t>(MenuItemPtr->Type);
+    writeInt<uint32_t>(MenuItemPtr->State);
+    writeInt<uint32_t>(MenuItemPtr->Id);
+    writeInt<uint16_t>(Flags);
+    padStream(sizeof(uint16_t));
+    RETURN_IF_ERROR(writeCString(MenuItemPtr->Name));
+    return Error::success();
+  }
+
+  auto *PopupPtr = cast<PopupExItem>(DefPtr);
+  writeInt<uint32_t>(PopupPtr->Type);
+  writeInt<uint32_t>(PopupPtr->State);
+  writeInt<uint32_t>(PopupPtr->Id);
+  writeInt<uint16_t>(Flags);
+  padStream(sizeof(uint16_t));
+  RETURN_IF_ERROR(writeCString(PopupPtr->Name));
+  writeInt<uint32_t>(PopupPtr->HelpId);
+  return writeMenuExDefinitionList(PopupPtr->SubItems);
+}
+
 Error ResourceFileWriter::writeMenuDefinitionList(
     const MenuDefinitionList &List) {
   for (auto &Def : List.Definitions) {
@@ -1216,6 +1249,20 @@ Error ResourceFileWriter::writeMenuDefinitionList(
   return Error::success();
 }
 
+Error ResourceFileWriter::writeMenuExDefinitionList(
+    const MenuDefinitionList &List) {
+  for (auto &Def : List.Definitions) {
+    uint16_t Flags = Def->getResFlags();
+    // Last element receives an additional 0x80 flag.
+    const uint16_t LastElementFlag = 0x0080;
+    if (&Def == &List.Definitions.back())
+      Flags |= LastElementFlag;
+
+    RETURN_IF_ERROR(writeMenuExDefinition(Def, Flags));
+  }
+  return Error::success();
+}
+
 Error ResourceFileWriter::writeMenuBody(const RCResource *Base) {
   // At first, MENUHEADER structure. In fact, these are two WORDs equal to 0.
   // Ref: msdn.microsoft.com/en-us/library/windows/desktop/ms648018.aspx
@@ -1224,6 +1271,17 @@ Error ResourceFileWriter::writeMenuBody(const RCResource *Base) {
   return writeMenuDefinitionList(cast<MenuResource>(Base)->Elements);
 }
 
+Error ResourceFileWriter::writeMenuExBody(const RCResource *Base) {
+  // At first, MENUEX_TEMPLATE_HEADER structure.
+  // Ref:
+  // https://learn.microsoft.com/en-us/windows/win32/menurc/menuex-template-header
+  writeInt<uint16_t>(1);
+  writeInt<uint16_t>(4);
+  writeInt<uint32_t>(0);
+
+  return writeMenuExDefinitionList(cast<MenuExResource>(Base)->Elements);
+}
+
 // --- StringTableResource helpers. --- //
 
 class BundleResource : public RCResource {
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.h b/llvm/tools/llvm-rc/ResourceFileWriter.h
index 7a92f84d5d84f2..d809890ee8e820 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.h
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.h
@@ -55,6 +55,7 @@ class ResourceFileWriter : public Visitor {
   Error visitHTMLResource(const RCResource *) override;
   Error visitIconResource(const RCResource *) override;
   Error visitMenuResource(const RCResource *) override;
+  Error visitMenuExResource(const RCResource *) override;
   Error visitVersionInfoResource(const RCResource *) override;
   Error visitStringTableResource(const RCResource *) override;
   Error visitUserDefinedResource(const RCResource *) override;
@@ -150,8 +151,12 @@ class ResourceFileWriter : public Visitor {
   // MenuResource
   Error writeMenuDefinition(const std::unique_ptr<MenuDefinition> &,
                             uint16_t Flags);
+  Error writeMenuExDefinition(const std::unique_ptr<MenuDefinition> &,
+                              uint16_t Flags);
   Error writeMenuDefinitionList(const MenuDefinitionList &List);
+  Error writeMenuExDefinitionList(const MenuDefinitionList &List);
   Error writeMenuBody(const RCResource *);
+  Error writeMenuExBody(const RCResource *);
 
   // StringTableResource
   Error visitStringTableBundle(const RCResource *);
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
index 0037fec4c0247d..9e1047448831b3 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
@@ -80,6 +80,8 @@ RCParser::ParseType RCParser::parseSingleResource() {
     Result = parseIconResource();
   else if (TypeToken->equalsLower("MENU"))
     Result = parseMenuResource();
+  else if (TypeToken->equalsLower("MENUEX"))
+    Result = parseMenuExResource();
   else if (TypeToken->equalsLower("RCDATA"))
     Result = parseUserDefinedResource(RkRcData);
   else if (TypeToken->equalsLower("VERSIONINFO"))
@@ -499,7 +501,7 @@ RCParser::ParseType RCParser::parseUserDefinedResource(IntOrString Type) {
   case Kind::String:
   case Kind::Identifier:
     return std::make_unique<UserDefinedResource>(Type, read().value(),
-                                                  MemoryFlags);
+                                                 MemoryFlags);
   default:
     break;
   }
@@ -518,7 +520,7 @@ RCParser::ParseType RCParser::parseUserDefinedResource(IntOrString Type) {
   }
 
   return std::make_unique<UserDefinedResource>(Type, std::move(Data),
-                                                MemoryFlags);
+                                               MemoryFlags);
 }
 
 RCParser::ParseType RCParser::parseVersionInfoResource() {
@@ -557,7 +559,8 @@ Expected<Control> RCParser::parseControl() {
   IntOrString Class;
   std::optional<IntWithNotMask> Style;
   if (ClassUpper == "CONTROL") {
-    // CONTROL text, id, class, style, x, y, width, height [, exstyle] [, helpID]
+    // CONTROL text, id, class, style, x, y, width, height [, exstyle] [,
+    // helpID]
     ASSIGN_OR_RETURN(ClassStr, readString());
     RETURN_IF_ERROR(consumeType(Kind::Comma));
     Class = *ClassStr;
@@ -589,8 +592,8 @@ Expected<Control> RCParser::parseControl() {
     HelpID = *Val;
   }
 
-  return Control(*ClassResult, Caption, *ID, (*Args)[0], (*Args)[1],
-                 (*Args)[2], (*Args)[3], Style, ExStyle, HelpID, Class);
+  return Control(*ClassResult, Caption, *ID, (*Args)[0], (*Args)[1], (*Args)[2],
+                 (*Args)[3], Style, ExStyle, HelpID, Class);
 }
 
 RCParser::ParseType RCParser::parseBitmapResource() {
@@ -620,7 +623,14 @@ RCParser::ParseType RCParser::parseMenuResource() {
   ASSIGN_OR_RETURN(OptStatements, parseOptionalStatements());
   ASSIGN_OR_RETURN(Items, parseMenuItemsList());
   return std::make_unique<MenuResource>(std::move(*OptStatements),
-                                         std::move(*Items), MemoryFlags);
+                                        std::move(*Items), MemoryFlags);
+}
+
+RCParser::ParseType RCParser::parseMenuExResource() {
+  uint16_t MemoryFlags =
+      parseMemoryFlags(MenuExResource::getDefaultMemoryFlags());
+  ASSIGN_OR_RETURN(Items, parseMenuExItemsList());
+  return std::make_unique<MenuExResource>(std::move(*Items), MemoryFlags);
 }
 
 Expected<MenuDefinitionList> RCParser::parseMenuItemsList() {
@@ -682,6 +692,95 @@ Expected<MenuDefinitionList> RCParser::parseMenuItemsList() {
   return std::move(List);
 }
 
+Expected<MenuDefinitionList> RCParser::parseMenuExItemsList() {
+  RETURN_IF_ERROR(consumeType(Kind::BlockBegin));
+
+  MenuDefinitionList List;
+
+  // Read a set of items. Each item is of one of two kinds:
+  //   MENUITEM caption:String [,[id][, [type][, state]]]]
+  //   POPUP caption:String [,[id][, [type][, [state][, helpID]]]] { popupBody }
+  while (!consumeOptionalType(Kind::BlockEnd)) {
+    ASSIGN_OR_RETURN(ItemTypeResult, readIdentifier());
+
+    bool IsMenuItem = ItemTypeResult->equals_insensitive("MENUITEM");
+    bool IsPopup = ItemTypeResult->equals_insensitive("POPUP");
+    if (!IsMenuItem && !IsPopup)
+      return getExpectedError("MENUITEM, POPUP, END or '}'", true);
+
+    // Not a separator. Read the caption.
+    ASSIGN_OR_RETURN(CaptionResult, readString());
+
+    // If MENUITEM, expect [,[id][, [type][, state]]]]
+    if (IsMenuItem) {
+      uint32_t MenuId = 0;
+      uint32_t MenuType = 0;
+      uint32_t MenuState = 0;
+
+      if (consumeOptionalType(Kind::Comma)) {
+        auto IntId = readInt();
+        if (IntId) {
+          MenuId = *IntId;
+        }
+        if (consumeOptionalType(Kind::Comma)) {
+          auto IntType = readInt();
+          if (IntType) {
+            MenuType = *IntType;
+          }
+          if (consumeOptionalType(Kind::Comma)) {
+            auto IntState = readInt();
+            if (IntState) {
+              MenuState = *IntState;
+            }
+          }
+        }
+      }
+      List.addDefinition(std::make_unique<MenuExItem>(*CaptionResult, MenuId,
+                                                      MenuType, MenuState));
+      continue;
+    }
+
+    assert(IsPopup);
+
+    uint32_t PopupId = 0;
+    uint32_t PopupType = 0;
+    uint32_t PopupState = 0;
+    uint32_t PopupHelpID = 0;
+
+    if (consumeOptionalType(Kind::Comma)) {
+      auto IntId = readInt();
+      if (IntId) {
+        PopupId = *IntId;
+      }
+      if (consumeOptionalType(Kind::Comma)) {
+        auto IntType = readInt();
+        if (IntType) {
+          PopupType = *IntType;
+        }
+        if (consumeOptionalType(Kind::Comma)) {
+          auto IntState = readInt();
+          if (IntState) {
+            PopupState = *IntState;
+          }
+          if (consumeOptionalType(Kind::Comma)) {
+            auto IntHelpID = readInt();
+            if (IntHelpID) {
+              PopupHelpID = *IntHelpID;
+            }
+          }
+        }
+      }
+    }
+    // If POPUP, read submenu items recursively.
+    ASSIGN_OR_RETURN(SubMenuResult, parseMenuExItemsList());
+    List.addDefinition(std::make_unique<PopupExItem>(
+        *CaptionResult, PopupId, PopupType, PopupState, PopupHelpID,
+        std::move(*SubMenuResult)));
+  }
+
+  return std::move(List);
+}
+
 RCParser::ParseType RCParser::parseStringTableResource() {
   uint16_t MemoryFlags =
       parseMemoryFlags(StringTableResource::getDefaultMemoryFlags());
@@ -689,7 +788,7 @@ RCParser::ParseType RCParser::parseStringTableResource() {
   RETURN_IF_ERROR(consumeType(Kind::BlockBegin));
 
   auto Table = std::make_unique<StringTableResource>(std::move(*OptStatements),
-                                                      MemoryFlags);
+                                                     MemoryFlags);
 
   // Read strings until we reach the end of the block.
   while (!consumeOptionalType(Kind::BlockEnd)) {
@@ -753,7 +852,7 @@ Expected<std::unique_ptr<VersionInfoStmt>> RCParser::parseVersionInfoStmt() {
       PrecedingCommas.push_back(HadComma);
     }
     return std::make_unique<VersionInfoValue>(*KeyResult, std::move(Values),
-                                               std::move(PrecedingCommas));
+                                              std::move(PrecedingCommas));
   }
 
   return getExpectedError("BLOCK or VALUE", true);
@@ -835,7 +934,7 @@ RCParser::ParseOptionType RCParser::parseFontStmt(OptStmtType DialogType) {
     }
   }
   return std::make_unique<FontStmt>(*SizeResult, *NameResult, FontWeight,
-                                     FontItalic, FontCharset);
+                                    FontItalic, FontCharset);
 }
 
 RCParser::ParseOptionType RCParser::parseStyleStmt() {
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h
index a7d3b595f79437..5c01cec0f151e8 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.h
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -143,6 +143,7 @@ class RCParser {
   ParseType parseIconResource();
   ParseType parseHTMLResource();
   ParseType parseMenuResource();
+  ParseType parseMenuExResource();
   ParseType parseStringTableResource();
   ParseType parseUserDefinedResource(IntOrString Type);
   ParseType parseVersionInfoResource();
@@ -153,6 +154,9 @@ class RCParser {
   // Helper MENU parser.
   Expected<MenuDefinitionList> parseMenuItemsList();
 
+  // Helper MENUEX parser.
+  Expected<MenuDefinitionList> parseMenuExItemsList();
+
   // Helper VERSIONINFO parser - read the contents of a single BLOCK statement,
   // from BEGIN to END.
   Expected<std::unique_ptr<VersionInfoBlock>>
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
index ef8c34541881ac..62df7999252fe1 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
@@ -102,6 +102,12 @@ raw_ostream &MenuSeparator::log(raw_ostream &OS) const {
   return OS << "  Menu separator\n";
 }
 
+raw_ostream &MenuExItem::log(raw_ostream &OS) const {
+  OS << "  MenuExItem (" << Name << "), ID = " << Id;
+  OS << ", type: " << Type << ", state: " << State;
+  return OS << "\n";
+}
+
 raw_ostream &PopupItem::log(raw_ostream &OS) const {
   OS << "  Popup (" << Name << ")";
   logFlags(OS, Flags);
@@ -109,12 +115,25 @@ raw_ostream &PopupItem::log(raw_ostream &OS) const {
   return SubItems.log(OS);
 }
 
+raw_ostream &PopupExItem::log(raw_ostream &OS) const {
+  OS << "  Popup (" << Name << ")";
+  OS << ", type: " << Type << ", state: " << State << ", help ID: " << HelpId;
+  OS << ":\n";
+  return SubItems.log(OS);
+}
+
 raw_ostream &MenuResource::log(raw_ostream &OS) const {
   OS << "Menu (" << ResName << "):\n";
   OptStatements->log(OS);
   return Elements.log(OS);
 }
 
+raw_ostream &MenuExResource::log(raw_ostream &OS) const {
+  OS << "MenuEx (" << ResName << "):\n";
+  OptStatements->log(OS);
+  return Elements.log(OS);
+}
+
 raw_ostream &StringTableResource::log(raw_ostream &OS) const {
   OS << "StringTable:\n";
   OptStatements->log(OS);
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index 71f6a9d212e46c..09853da6c500aa 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -536,6 +536,23 @@ class MenuItem : public MenuDefinition {
   }
 };
 
+class MenuExItem : public MenuDefinition {
+public:
+  StringRef Name;
+  uint32_t Id;
+  uint32_t Type;
+  uint32_t State;
+
+  MenuExItem(StringRef Caption, uint32_t ItemId, uint32_t Type, uint32_t State)
+      : Name(Caption), Id(ItemId), Type(Type), State(State) {}
+  raw_ostream &log(raw_ostream &) const override;
+
+  MenuDefKind getKind() const override { return MkMenuItem; }
+  static bool classof(const MenuDefinition *D) {
+    return D->getKind() == MkMenuItem;
+  }
+};
+
 // POPUP statement definition.
 //
 // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa381030(v=vs.85).aspx
@@ -550,8 +567,7 @@ class PopupItem : public MenuDefinition {
       : Name(Caption), Flags(ItemFlags), SubItems(std::move(SubItemsList)) {}
   raw_ostream &log(raw_ostream &) const override;
 
-  // This has an additional (0x10) flag. It doesn't match with documented
-  // 0x01 flag, though.
+  // This has an additional MF_POPUP (0x10) flag.
   uint16_t getResFlags() const override { return Flags | 0x10; }
   MenuDefKind getKind() const override { return MkPopup; }
   static bool classof(const MenuDefinition *D) {
@@ -559,6 +575,28 @@ class PopupItem : public MenuDefinition {
   }
 };
 
+class PopupExItem : public MenuDefinition {
+public:
+  StringRef Name;
+  uint32_t Id;
+  uint32_t Type;
+  uint32_t State;
+  uint32_t HelpId;
+  MenuDefinitionList SubItems;
+
+  PopupExItem(StringRef Caption, uint32_t Id, uint32_t Type, uint32_t State,
+              uint32_t HelpId, MenuDefinitionList &&SubItemsList)
+      : Name(Caption), Id(Id), Type(Type), State(State), HelpId(HelpId),
+        SubItems(std::move(SubItemsList)) {}
+  raw_ostream &log(raw_ostream &) const override;
+
+  uint16_t getResFlags() const override { return 0x01; }
+  MenuDefKind getKind() const override { return MkPopup; }
+  static bool classof(const MenuDefinition *D) {
+    return D->getKind() == MkPopup;
+  }
+};
+
 // Menu resource definition.
 class MenuResource : public OptStatementsRCResource {
 public:
@@ -579,6 +617,25 @@ class MenuResource : public OptStatementsRCResource {
   }
 };
 
+class MenuExResource : public OptStatementsRCResource {
+public:
+  MenuDefinitionList Elements;
+
+  MenuExResource(MenuDefinitionList &&Items, uint16_t Flags)
+      : OptStatementsRCResource({}, Flags), Elements(std::move(Items)) {}
+  raw_ostream &log(raw_ostream &) const override;
+
+  IntOrString getResourceType() const override { return RkMenu; }
+  Twine getResourceTypeName() const override { return "MENUEX"; }
+  Error visit(Visitor *V) const override {
+    return V->visitMenuExResource(this);
+  }
+  ResourceKind getKind() const override { return RkMenu; }
+  static bool classof(const RCResource *Res) {
+    return Res->getKind() == RkMenu;
+  }
+};
+
 // STRINGTABLE resource. Contains a list of strings, each having its unique ID.
 //
 // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa381050(v=vs.85).aspx
diff --git a/llvm/tools/llvm-rc/ResourceVisitor.h b/llvm/tools/llvm-rc/ResourceVisitor.h
index 843c8d898a2112..a950cd7555ecdf 100644
--- a/llvm/tools/llvm-rc/ResourceVisitor.h
+++ b/llvm/tools/llvm-rc/ResourceVisitor.h
@@ -39,6 +39,7 @@ class Visitor {
   virtual Error visitHTMLResource(const RCResource *) = 0;
   virtual Error visitIconResource(const RCResource *) = 0;
   virtual Error visitMenuResource(const RCResource *) = 0;
+  virtual Error visitMenuExResource(const RCResource *) = 0;
   virtual Error visitStringTableResource(const RCResource *) = 0;
   virtual Error visitUserDefinedResource(const RCResource *) = 0;
   virtual Error visitVersionInfoResource(const RCResource *) = 0;
diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
index c377fe557cb169..2a327bf134601d 100644
--- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -764,31 +764,17 @@ llvm::parseReducerWorkItem(StringRef ToolName, StringRef Filename,
 
     auto SetDataLayout = [&](StringRef DataLayoutTargetTriple,
                              StringRef OldDLStr) -> std::optional<std::string> {
-      // If we are supposed to override the target triple, do so now.
+      // NB: We always call createTargetMachineForTriple() even if an explicit
+      // DataLayout is already set in the module since we want to use this
+      // callback to setup the TargetMachine rather than doing it later.
       std::string IRTargetTriple = DataLayoutTargetTriple.str();
       if (!TargetTriple.empty())
         IRTargetTriple = Triple::normalize(TargetTriple);
       TheTriple = Triple(IRTargetTriple);
       if (TheTriple.getTriple().empty())
         TheTriple.setTriple(sys::getDefaultTargetTriple());
-
-      std::string Error;
-      const Target *TheTarget =
-          TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error);
-      if (!TheTarget) {
-        WithColor::error(errs(), ToolName) << Error;
-        exit(1);
-      }
-
-      // Hopefully the MIR parsing doesn't depend on any options.
-      TargetOptions Options;
-      std::optional<Reloc::Model> RM = codegen::getExplicitRelocModel();
-      std::string CPUStr = codegen::getCPUStr();
-      std::string FeaturesStr = codegen::getFeaturesStr();
-      TM = std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-          TheTriple.getTriple(), CPUStr, FeaturesStr, Options, RM,
-          codegen::getExplicitCodeModel(), CodeGenOptLevel::Default));
-      assert(TM && "Could not allocate target machine!");
+      ExitOnError ExitOnErr(std::string(ToolName) + ": error: ");
+      TM = ExitOnErr(codegen::createTargetMachineForTriple(TheTriple.str()));
 
       return TM->createDataLayout().getStringRepresentation();
     };
diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
index e301ff94d82b59..b58d7cd952aff0 100644
--- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
+++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
@@ -16,6 +16,16 @@
 using namespace llvm;
 using namespace llvm::object;
 
+static DXContainerYAML::Signature dumpSignature(const DirectX::Signature &Sig) {
+  DXContainerYAML::Signature YAML;
+  for (auto Param : Sig)
+    YAML.Parameters.push_back(DXContainerYAML::SignatureParameter{
+        Param.Stream, Sig.getName(Param.NameOffset).str(), Param.Index,
+        Param.SystemValue, Param.CompType, Param.Register, Param.Mask,
+        Param.ExclusiveMask, Param.MinPrecision});
+  return YAML;
+}
+
 static Expected<DXContainerYAML::Object *>
 dumpDXContainer(MemoryBufferRef Source) {
   assert(file_magic::dxcontainer_object == identify_magic(Source.getBuffer()));
@@ -129,6 +139,15 @@ dumpDXContainer(MemoryBufferRef Source) {
 
       break;
     }
+    case dxbc::PartType::ISG1:
+      NewPart.Signature = dumpSignature(Container.getInputSignature());
+      break;
+    case dxbc::PartType::OSG1:
+      NewPart.Signature = dumpSignature(Container.getOutputSignature());
+      break;
+    case dxbc::PartType::PSG1:
+      NewPart.Signature = dumpSignature(Container.getPatchConstantSignature());
+      break;
     case dxbc::PartType::Unknown:
       break;
     }
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index cf4b629ec01977..ed9c10d971218f 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -286,24 +286,6 @@ static CodeGenOptLevel GetCodeGenOptLevel() {
   return static_cast<CodeGenOptLevel>(unsigned(CodeGenOptLevelCL));
 }
 
-// Returns the TargetMachine instance or zero if no triple is provided.
-static TargetMachine* GetTargetMachine(Triple TheTriple, StringRef CPUStr,
-                                       StringRef FeaturesStr,
-                                       const TargetOptions &Options) {
-  std::string Error;
-  const Target *TheTarget =
-      TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error);
-  // Some modules don't specify a triple, and this is okay.
-  if (!TheTarget) {
-    return nullptr;
-  }
-
-  return TheTarget->createTargetMachine(
-      TheTriple.getTriple(), codegen::getCPUStr(), codegen::getFeaturesStr(),
-      Options, codegen::getExplicitRelocModel(),
-      codegen::getExplicitCodeModel(), GetCodeGenOptLevel());
-}
-
 struct TimeTracerRAII {
   TimeTracerRAII(StringRef ProgramName) {
     if (TimeTrace)
@@ -611,14 +593,19 @@ int main(int argc, char **argv) {
 
   Triple ModuleTriple(M->getTargetTriple());
   std::string CPUStr, FeaturesStr;
-  TargetMachine *Machine = nullptr;
-  const TargetOptions Options =
-      codegen::InitTargetOptionsFromCodeGenFlags(ModuleTriple);
-
+  std::unique_ptr<TargetMachine> TM;
   if (ModuleTriple.getArch()) {
     CPUStr = codegen::getCPUStr();
     FeaturesStr = codegen::getFeaturesStr();
-    Machine = GetTargetMachine(ModuleTriple, CPUStr, FeaturesStr, Options);
+    Expected<std::unique_ptr<TargetMachine>> ExpectedTM =
+        codegen::createTargetMachineForTriple(ModuleTriple.str(),
+                                              GetCodeGenOptLevel());
+    if (auto E = ExpectedTM.takeError()) {
+      errs() << argv[0] << ": WARNING: failed to create target machine for '"
+             << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n";
+    } else {
+      TM = std::move(*ExpectedTM);
+    }
   } else if (ModuleTriple.getArchName() != "unknown" &&
              ModuleTriple.getArchName() != "") {
     errs() << argv[0] << ": unrecognized architecture '"
@@ -626,8 +613,6 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  std::unique_ptr<TargetMachine> TM(Machine);
-
   // Override function attributes based on CPUStr, FeaturesStr, and command line
   // flags.
   codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M);
diff --git a/llvm/unittests/ADT/HashingTest.cpp b/llvm/unittests/ADT/HashingTest.cpp
index 01a8a962b8e2e3..d175f8afee34a4 100644
--- a/llvm/unittests/ADT/HashingTest.cpp
+++ b/llvm/unittests/ADT/HashingTest.cpp
@@ -425,7 +425,7 @@ struct StructWithHashBuilderSupport {
   char C;
   int I;
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const StructWithHashBuilderSupport &Value) {
     HBuilder.add(Value.C, Value.I);
   }
@@ -440,7 +440,7 @@ struct StructWithHashBuilderAndHashValueSupport {
   char C;
   int I;
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const StructWithHashBuilderAndHashValueSupport &Value) {}
   friend hash_code
   hash_value(const StructWithHashBuilderAndHashValueSupport &Value) {
diff --git a/llvm/unittests/Analysis/BlockFrequencyInfoTest.cpp b/llvm/unittests/Analysis/BlockFrequencyInfoTest.cpp
index 9fab0a575fd65f..c1e94cd5898fbe 100644
--- a/llvm/unittests/Analysis/BlockFrequencyInfoTest.cpp
+++ b/llvm/unittests/Analysis/BlockFrequencyInfoTest.cpp
@@ -84,7 +84,7 @@ TEST_F(BlockFrequencyInfoTest, Basic) {
 
   // Scale the frequencies of BB0, BB1 and BB2 by a factor of two.
   SmallPtrSet<BasicBlock *, 4> BlocksToScale({BB1, BB2});
-  BFI.setBlockFreqAndScale(&BB0, BB0Freq * 2, BlocksToScale);
+  BFI.setBlockFreqAndScale(&BB0, BlockFrequency(BB0Freq * 2), BlocksToScale);
   EXPECT_EQ(BFI.getBlockFreq(&BB0).getFrequency(), 2 * BB0Freq);
   EXPECT_EQ(BFI.getBlockFreq(BB1).getFrequency(), 2 * BB1Freq);
   EXPECT_EQ(BFI.getBlockFreq(BB2).getFrequency(), 2 * BB2Freq);
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
index 1053c1bdff07fb..e883b429169548 100644
--- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -352,7 +352,7 @@ TEST_F(ScalarEvolutionsTest, CompareSCEVComplexity) {
 
 TEST_F(ScalarEvolutionsTest, CompareValueComplexity) {
   IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(Context);
-  PointerType *IntPtrPtrTy = IntPtrTy->getPointerTo();
+  PointerType *IntPtrPtrTy = PointerType::getUnqual(Context);
 
   FunctionType *FTy =
       FunctionType::get(Type::getVoidTy(Context), {IntPtrTy, IntPtrTy}, false);
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 4eed22d224588f..5bd1bb37e678e9 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -1848,13 +1848,13 @@ TEST_F(ComputeKnownFPClassTest, FCmpToClassTest_NInf) {
       fcmpToClassTest(CmpInst::FCMP_OGT, *A3->getFunction(), A3->getOperand(0),
                       A3->getOperand(1));
   EXPECT_EQ(nullptr, OgtVal);
-  EXPECT_EQ(fcNone, OgtClass);
+  EXPECT_EQ(fcAllFlags, OgtClass);
 
   auto [UleVal, UleClass] =
       fcmpToClassTest(CmpInst::FCMP_ULE, *A4->getFunction(), A4->getOperand(0),
                       A4->getOperand(1));
   EXPECT_EQ(nullptr, UleVal);
-  EXPECT_EQ(fcNone, UleClass);
+  EXPECT_EQ(fcAllFlags, UleClass);
 }
 
 TEST_F(ComputeKnownFPClassTest, FCmpToClassTest_PInf) {
@@ -1881,13 +1881,13 @@ TEST_F(ComputeKnownFPClassTest, FCmpToClassTest_PInf) {
       fcmpToClassTest(CmpInst::FCMP_OLE, *A3->getFunction(), A3->getOperand(0),
                       A3->getOperand(1));
   EXPECT_EQ(nullptr, OleVal);
-  EXPECT_EQ(fcNone, OleClass);
+  EXPECT_EQ(fcAllFlags, OleClass);
 
   auto [UgtVal, UgtClass] =
       fcmpToClassTest(CmpInst::FCMP_UGT, *A4->getFunction(), A4->getOperand(0),
                       A4->getOperand(1));
   EXPECT_EQ(nullptr, UgtVal);
-  EXPECT_EQ(fcNone, UgtClass);
+  EXPECT_EQ(fcAllFlags, UgtClass);
 }
 
 TEST_F(ComputeKnownFPClassTest, SqrtNszSignBit) {
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index 466993161fe1ab..a4c6b2143fc662 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -98,19 +98,6 @@ class VFABIParserTest : public ::testing::Test {
 };
 } // unnamed namespace
 
-// This test makes sure correct mangling occurs for given string.
-TEST_F(VFABIParserTest, ManglingVectorTLINames) {
-  EXPECT_EQ(
-      VFABI::mangleTLIVectorName("vec", "scalar", 3, ElementCount::getFixed(4)),
-      "_ZGV_LLVM_N4vvv_scalar(vec)");
-  EXPECT_EQ(VFABI::mangleTLIVectorName("vec", "scalar", 3,
-                                       ElementCount::getScalable(4)),
-            "_ZGV_LLVM_Nxvvv_scalar(vec)");
-  EXPECT_EQ(VFABI::mangleTLIVectorName("custom.call.v5", "custom.call", 1,
-                                       ElementCount::getFixed(5)),
-            "_ZGV_LLVM_N5v_custom.call(custom.call.v5)");
-}
-
 // This test makes sure that the demangling method succeeds only on
 // valid values of the string.
 TEST_F(VFABIParserTest, OnlyValidNames) {
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
index 8f99fcc073b6d9..2adc2403eaca9e 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
@@ -37,7 +37,6 @@
 using namespace llvm;
 using namespace dwarf;
 using namespace utils;
-using ::testing::HasSubstr;
 
 namespace {
 
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp
index 7781061bcb4043..e1057b214ee4d3 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFDieTest.cpp
@@ -15,7 +15,6 @@
 
 using namespace llvm;
 using namespace llvm::dwarf;
-using object::SectionedAddress;
 
 namespace {
 
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFLocationExpressionTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFLocationExpressionTest.cpp
index 5f253f647289c6..d2d8efbe92cb23 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFLocationExpressionTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFLocationExpressionTest.cpp
@@ -11,7 +11,6 @@
 #include "gtest/gtest.h"
 
 using namespace llvm;
-using object::SectionedAddress;
 
 TEST(DWARFLocationExpression, Equality) {
   EXPECT_EQ((DWARFLocationExpression{std::nullopt, {}}),
diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
index 6bc4667e710394..481a83fa535e84 100644
--- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
+++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
@@ -628,7 +628,7 @@ TEST(GSYMTest, TestAddressRangeEncodeDecode) {
   // the base address of the parent range for subranges.
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = llvm::support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   const uint64_t BaseAddr = 0x1000;
   const AddressRange Range1(0x1000, 0x1010);
@@ -651,7 +651,7 @@ static void TestAddressRangeEncodeDecodeHelper(const AddressRanges &Ranges,
                                                const uint64_t BaseAddr) {
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = llvm::support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   encodeRanges(Ranges, FW, BaseAddr);
 
@@ -1163,7 +1163,7 @@ TEST(GSYMTest, TestGsymReader) {
   constexpr uint64_t FuncSize = 0x10;
   const uint32_t Func1Name = GC.insertString("foo");
   const uint32_t Func2Name = GC.insertString("bar");
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   GC.addFunctionInfo(FunctionInfo(Func1Addr, FuncSize, Func1Name));
   GC.addFunctionInfo(FunctionInfo(Func2Addr, FuncSize, Func2Name));
   Error FinalizeErr = GC.finalize(llvm::nulls());
@@ -1201,7 +1201,7 @@ TEST(GSYMTest, TestGsymLookups) {
   // symbolication.
   GsymCreator GC;
   FunctionInfo FI(0x1000, 0x100, GC.insertString("main"));
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FI.OptLineTable = LineTable();
   const uint32_t MainFileIndex = GC.insertFile("/tmp/main.c");
   const uint32_t FooFileIndex = GC.insertFile("/tmp/foo.h");
@@ -1354,7 +1354,7 @@ TEST(GSYMTest, TestDWARFFunctionWithAddresses) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -1431,7 +1431,7 @@ TEST(GSYMTest, TestDWARFFunctionWithAddressAndOffset) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -1538,7 +1538,7 @@ TEST(GSYMTest, TestDWARFStructMethodNoMangled) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -1643,7 +1643,7 @@ TEST(GSYMTest, TestDWARFTextRanges) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -1672,7 +1672,7 @@ TEST(GSYMTest, TestEmptySymbolEndAddressOfTextRanges) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -1841,7 +1841,7 @@ TEST(GSYMTest, TestDWARFInlineInfo) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -2101,7 +2101,7 @@ TEST(GSYMTest, TestDWARFNoLines) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -2280,7 +2280,7 @@ TEST(GSYMTest, TestDWARFDeadStripAddr4) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -2420,7 +2420,7 @@ TEST(GSYMTest, TestDWARFDeadStripAddr8) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -2507,7 +2507,7 @@ static Expected<GsymReader> FinalizeEncodeAndDecode(GsymCreator &GC) {
     return std::move(FinalizeErr);
   SmallString<1024> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   llvm::Error Err = GC.encode(FW);
   if (Err)
@@ -3057,7 +3057,7 @@ TEST(GSYMTest, TestDWARFInlineRangeScopes) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -3284,7 +3284,7 @@ TEST(GSYMTest, TestDWARFEmptyInline) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -3520,7 +3520,7 @@ TEST(GSYMTest, TestFinalizeForLineTables) {
   ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -3800,7 +3800,7 @@ TEST(GSYMTest, TestRangeWarnings) {
   OS.flush();
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
@@ -4002,7 +4002,7 @@ TEST(GSYMTest, TestEmptyRangeWarnings) {
   OS.flush();
   SmallString<512> Str;
   raw_svector_ostream OutStrm(Str);
-  const auto ByteOrder = support::endian::system_endianness();
+  const auto ByteOrder = llvm::endianness::native;
   FileWriter FW(OutStrm, ByteOrder);
   ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
   Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index fd524f6067ee0e..4704d8cb492b70 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -3070,8 +3070,8 @@ TEST_F(OpenMPIRBuilderTest, CopyinBlocks) {
   OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
 
   IntegerType *Int32 = Type::getInt32Ty(M->getContext());
-  AllocaInst *MasterAddress = Builder.CreateAlloca(Int32->getPointerTo());
-  AllocaInst *PrivAddress = Builder.CreateAlloca(Int32->getPointerTo());
+  AllocaInst *MasterAddress = Builder.CreateAlloca(Builder.getPtrTy());
+  AllocaInst *PrivAddress = Builder.CreateAlloca(Builder.getPtrTy());
 
   BasicBlock *EntryBB = BB;
 
diff --git a/llvm/unittests/FuzzMutate/OperationsTest.cpp b/llvm/unittests/FuzzMutate/OperationsTest.cpp
index 82a37e02aab8c5..e8c84d8cd26798 100644
--- a/llvm/unittests/FuzzMutate/OperationsTest.cpp
+++ b/llvm/unittests/FuzzMutate/OperationsTest.cpp
@@ -48,7 +48,6 @@ using testing::Each;
 using testing::ElementsAre;
 using testing::Eq;
 using testing::Ge;
-using testing::NotNull;
 using testing::PrintToString;
 using testing::SizeIs;
 using testing::Truly;
diff --git a/llvm/unittests/IR/AsmWriterTest.cpp b/llvm/unittests/IR/AsmWriterTest.cpp
index 3e0dad89b0c353..e9da0a4b39704f 100644
--- a/llvm/unittests/IR/AsmWriterTest.cpp
+++ b/llvm/unittests/IR/AsmWriterTest.cpp
@@ -87,7 +87,7 @@ TEST(AsmWriterTest, PrintNullOperandBundle) {
   LLVMContext C;
   Type *Int32Ty = Type::getInt32Ty(C);
   FunctionType *FnTy = FunctionType::get(Int32Ty, Int32Ty, /*isVarArg=*/false);
-  Value *Callee = Constant::getNullValue(FnTy->getPointerTo());
+  Value *Callee = Constant::getNullValue(PointerType::getUnqual(C));
   Value *Args[] = {ConstantInt::get(Int32Ty, 42)};
   std::unique_ptr<BasicBlock> NormalDest(BasicBlock::Create(C));
   std::unique_ptr<BasicBlock> UnwindDest(BasicBlock::Create(C));
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index ef5f974419029d..bda57176daeba8 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -190,7 +190,7 @@ TEST_F(IRBuilderTest, IntrinsicsWithScalableVectors) {
   // LLVMScalarOrSameVectorWidth.
 
   Type *VecTy = VectorType::get(Builder.getInt32Ty(), 4, true);
-  Type *PtrToVecTy = VecTy->getPointerTo();
+  Type *PtrToVecTy = Builder.getPtrTy();
   PredTy = VectorType::get(Builder.getInt1Ty(), 4, true);
 
   ArgTys.clear();
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 637c69204addfe..20b8529b386324 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -1024,71 +1024,141 @@ TEST(InstructionsTest, ShuffleMaskQueries) {
   Constant *C7 = ConstantInt::get(Int32Ty, 7);
 
   Constant *Identity = ConstantVector::get({C0, CU, C2, C3, C4});
-  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(Identity));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(Identity)); // identity is distinguished from select
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(Identity));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(Identity)); // identity is always single source
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Identity));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(Identity));
+  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      Identity,
+      cast<FixedVectorType>(Identity->getType())
+          ->getNumElements())); // identity is distinguished from select
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      Identity, cast<FixedVectorType>(Identity->getType())
+                    ->getNumElements())); // identity is always single source
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
 
   Constant *Select = ConstantVector::get({CU, C1, C5});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(Select));
-  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(Select));
-  
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+
   Constant *Reverse = ConstantVector::get({C3, C2, C1, CU});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(Reverse));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(Reverse));
-  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(Reverse));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(Reverse)); // reverse is always single source
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Reverse));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(Reverse));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())
+                   ->getNumElements())); // reverse is always single source
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
 
   Constant *SingleSource = ConstantVector::get({C2, C2, C0, CU});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(SingleSource));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(SingleSource));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
 
   Constant *ZeroEltSplat = ConstantVector::get({C0, C0, CU, C0});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(ZeroEltSplat));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(ZeroEltSplat));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(ZeroEltSplat));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(ZeroEltSplat)); // 0-splat is always single source
-  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(ZeroEltSplat));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(ZeroEltSplat));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      ZeroEltSplat, cast<FixedVectorType>(ZeroEltSplat->getType())
+                        ->getNumElements())); // 0-splat is always single source
+  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
 
   Constant *Transpose = ConstantVector::get({C0, C4, C2, C6});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Transpose));
-  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(Transpose));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
 
   // More tests to make sure the logic is/stays correct...
-  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(ConstantVector::get({CU, C1, CU, C3})));
-  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(ConstantVector::get({C4, CU, C6, CU})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(ConstantVector::get({C4, C1, C6, CU})));
-  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(ConstantVector::get({CU, C1, C6, C3})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(ConstantVector::get({C7, C6, CU, C4})));
-  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(ConstantVector::get({C3, CU, C1, CU})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(ConstantVector::get({C7, C5, CU, C7})));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(ConstantVector::get({C3, C0, CU, C3})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(ConstantVector::get({C4, CU, CU, C4})));
-  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(ConstantVector::get({CU, C0, CU, C0})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(ConstantVector::get({C1, C5, C3, C7})));
-  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(ConstantVector::get({C1, C3})));
+  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(
+      ConstantVector::get({CU, C1, CU, C3}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(
+      ConstantVector::get({C4, CU, C6, CU}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(
+      ConstantVector::get({C4, C1, C6, CU}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(
+      ConstantVector::get({CU, C1, C6, C3}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(
+      ConstantVector::get({C7, C6, CU, C4}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(
+      ConstantVector::get({C3, CU, C1, CU}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      ConstantVector::get({C7, C5, CU, C7}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      ConstantVector::get({C3, C0, CU, C3}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(
+      ConstantVector::get({C4, CU, CU, C4}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(
+      ConstantVector::get({CU, C0, CU, C0}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(
+      ConstantVector::get({C1, C5, C3, C7}), 4));
+  EXPECT_TRUE(
+      ShuffleVectorInst::isTransposeMask(ConstantVector::get({C1, C3}), 2));
 
   // Nothing special about the values here - just re-using inputs to reduce code. 
   Constant *V0 = ConstantVector::get({C0, C1, C2, C3});
diff --git a/llvm/unittests/IR/ShuffleVectorInstTest.cpp b/llvm/unittests/IR/ShuffleVectorInstTest.cpp
index ba1b9807cffb43..5d840ec8d67204 100644
--- a/llvm/unittests/IR/ShuffleVectorInstTest.cpp
+++ b/llvm/unittests/IR/ShuffleVectorInstTest.cpp
@@ -14,51 +14,84 @@ using namespace llvm;
 namespace {
 
 TEST(ShuffleVectorInst, isIdentityMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}));
-  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}));
-  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}));
+  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}, 5));
+  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}, 4));
 
-  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 4}));
-  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, -1, 2, 4}));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}, 4));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}, 3));
+
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}, 6));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}, 5));
+
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 4}, 4));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, -1, 2, 4}, 4));
 }
 
 TEST(ShuffleVectorInst, isSelectMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}));
+  ASSERT_TRUE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}, 4));
+
+  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}, 5));
 
-  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 1, 2, 3}));
+  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 1, 2, 3}, 4));
 }
 
 TEST(ShuffleVectorInst, isReverseMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}));
-  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}));
+  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}, 4));
 
-  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({4, 3, 2, 1}));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}, 5));
+
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({4, 3, 2, 1}, 4));
 }
 
 TEST(ShuffleVectorInst, isZeroEltSplatMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}));
-  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}));
+  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}, 4));
+
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}, 5));
 
-  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({1, 1, 1, 1}));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({1, 1, 1, 1}, 4));
 }
 
 TEST(ShuffleVectorInst, isTransposeMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}));
-  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}));
+  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}, 4));
 
-  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({2, 6, 4, 8}));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}, 5));
+
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({2, 6, 4, 8}, 4));
 }
 
 TEST(ShuffleVectorInst, isSpliceMask) {
   int Index;
 
-  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, Index));
+  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, 4, Index));
   ASSERT_EQ(0, Index);
 
-  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, Index));
+  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, 7, Index));
   ASSERT_EQ(1, Index);
 
-  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({4, 5, 6, 7}, Index));
+  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, 3, Index));
+  ASSERT_FALSE(
+      ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, 6, Index));
+  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, 5, Index));
+  ASSERT_FALSE(
+      ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, 8, Index));
+
+  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({4, 5, 6, 7}, 4, Index));
 }
 
 TEST(ShuffleVectorInst, isExtractSubvectorMask) {
diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp
index 94d8dc40defd46..da640225617d40 100644
--- a/llvm/unittests/Object/DXContainerTest.cpp
+++ b/llvm/unittests/Object/DXContainerTest.cpp
@@ -640,3 +640,139 @@ TEST(DXCFile, SigElementsExtendBeyondPart) {
       FailedWithMessage(
           "Signature elements extend beyond the size of the part"));
 }
+
+TEST(DXCFile, MalformedSignature) {
+  /*
+  The tests here exercise the DXContainer Signature section parser. These tests
+  are based on modifying the binary described by the following yaml:
+
+  --- !dxcontainer
+  Header:
+    Hash:            [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                      0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+    Version:
+      Major:           1
+      Minor:           0
+    FileSize:        128
+    PartCount:       1
+    PartOffsets:     [ 64 ]
+  Parts:
+    - Name:            ISG1
+      Size:            52
+      Signature:
+        Parameters:
+          - Stream:          0
+            Name:            AAA
+            Index:           0
+            SystemValue:     Undefined
+            CompType:        Float32
+            Register:        0
+            Mask:            7
+            ExclusiveMask:   2
+            MinPrecision:    Default
+  ...
+
+  The unmodified hex sequence is:
+
+  uint8_t Buffer[] = {
+    0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x80,
+  0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x49, 0x53, 0x47, 0x31, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x07, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x41, 0x41, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+  };
+
+  */
+
+  {
+
+    // This binary says the signature has 10 parameters, but the part data is
+    // only big enough for 1.
+    uint8_t Buffer[] = {
+        0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+        0x80, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x49, 0x53, 0x47, 0x31, 0x34, 0x00, 0x00, 0x00,
+        0x0A, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x41, 0x41, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00};
+    EXPECT_THAT_EXPECTED(
+        DXContainer::create(getMemoryBuffer<164>(Buffer)),
+        FailedWithMessage(
+            "Signature parameters extend beyond the part boundary"));
+  }
+
+  {
+
+    // This binary only has one parameter, but the start offset is beyond the
+    // size of the part.
+    uint8_t Buffer[] = {
+        0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+        0x80, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x49, 0x53, 0x47, 0x31, 0x34, 0x00, 0x00, 0x00,
+        0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x41, 0x41, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00};
+    EXPECT_THAT_EXPECTED(
+        DXContainer::create(getMemoryBuffer<164>(Buffer)),
+        FailedWithMessage(
+            "Signature parameters extend beyond the part boundary"));
+  }
+
+  {
+
+    // This parameter has a name offset of 3, which is before the start of the
+    // string table.
+    uint8_t Buffer[] = {
+        0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+        0x80, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x49, 0x53, 0x47, 0x31, 0x34, 0x00, 0x00, 0x00,
+        0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x41, 0x41, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00};
+    EXPECT_THAT_EXPECTED(
+        DXContainer::create(getMemoryBuffer<164>(Buffer)),
+        FailedWithMessage("Invalid parameter name offset: name starts before "
+                          "the first name offset"));
+  }
+
+  {
+    // This parameter has a name offset of 255, which is after the end of the
+    // part.
+    uint8_t Buffer[] = {
+        0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+        0x80, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x49, 0x53, 0x47, 0x31, 0x34, 0x00, 0x00, 0x00,
+        0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x41, 0x41, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00};
+    EXPECT_THAT_EXPECTED(
+        DXContainer::create(getMemoryBuffer<164>(Buffer)),
+        FailedWithMessage("Invalid parameter name offset: name starts after "
+                          "the end of the part data"));
+  }
+}
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index 76f88c058424d0..9c6d15552174cc 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -1250,23 +1250,23 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_test) {
   FuncNames.push_back("bar3");
   InstrProfSymtab Symtab;
   EXPECT_THAT_ERROR(Symtab.create(FuncNames), Succeeded());
-  StringRef R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("func1"));
+  StringRef R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("func1"));
   ASSERT_EQ(StringRef("func1"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("func2"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("func2"));
   ASSERT_EQ(StringRef("func2"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("func3"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("func3"));
   ASSERT_EQ(StringRef("func3"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar1"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar1"));
   ASSERT_EQ(StringRef("bar1"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar2"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar2"));
   ASSERT_EQ(StringRef("bar2"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar3"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar3"));
   ASSERT_EQ(StringRef("bar3"), R);
 
   // negative tests
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar4"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar4"));
   ASSERT_EQ(StringRef(), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("foo4"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("foo4"));
   ASSERT_EQ(StringRef(), R);
 
   // Now incrementally update the symtab
@@ -1275,23 +1275,23 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_test) {
   EXPECT_THAT_ERROR(Symtab.addFuncName("blah_3"), Succeeded());
 
   // Check again
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("blah_1"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("blah_1"));
   ASSERT_EQ(StringRef("blah_1"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("blah_2"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("blah_2"));
   ASSERT_EQ(StringRef("blah_2"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("blah_3"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("blah_3"));
   ASSERT_EQ(StringRef("blah_3"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("func1"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("func1"));
   ASSERT_EQ(StringRef("func1"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("func2"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("func2"));
   ASSERT_EQ(StringRef("func2"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("func3"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("func3"));
   ASSERT_EQ(StringRef("func3"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar1"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar1"));
   ASSERT_EQ(StringRef("bar1"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar2"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar2"));
   ASSERT_EQ(StringRef("bar2"), R);
-  R = Symtab.getFuncName(IndexedInstrProf::ComputeHash("bar3"));
+  R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash("bar3"));
   ASSERT_EQ(StringRef("bar3"), R);
 }
 
@@ -1331,14 +1331,14 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_module_test) {
 
     std::string IRPGOName = getIRPGOFuncName(*F);
     auto IRPGOFuncName =
-        ProfSymtab.getFuncName(IndexedInstrProf::ComputeHash(IRPGOName));
+        ProfSymtab.getFuncOrVarName(IndexedInstrProf::ComputeHash(IRPGOName));
     EXPECT_EQ(StringRef(IRPGOName), IRPGOFuncName);
     EXPECT_EQ(StringRef(Funcs[I]),
               getParsedIRPGOFuncName(IRPGOFuncName).second);
     // Ensure we can still read this old record name.
     std::string PGOName = getPGOFuncName(*F);
     auto PGOFuncName =
-        ProfSymtab.getFuncName(IndexedInstrProf::ComputeHash(PGOName));
+        ProfSymtab.getFuncOrVarName(IndexedInstrProf::ComputeHash(PGOName));
     EXPECT_EQ(StringRef(PGOName), PGOFuncName);
     EXPECT_THAT(PGOFuncName.str(), EndsWith(Funcs[I].str()));
   }
@@ -1396,9 +1396,10 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_compression_test) {
 
       // Now do the checks:
       // First sampling some data points:
-      StringRef R = Symtab.getFuncName(IndexedInstrProf::ComputeHash(FuncNames1[0]));
+      StringRef R =
+          Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash(FuncNames1[0]));
       ASSERT_EQ(StringRef("func_0"), R);
-      R = Symtab.getFuncName(IndexedInstrProf::ComputeHash(FuncNames1[1]));
+      R = Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash(FuncNames1[1]));
       ASSERT_EQ(StringRef("f oooooooooooooo_0"), R);
       for (int I = 0; I < 3; I++) {
         std::string N[4];
@@ -1407,7 +1408,8 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_compression_test) {
         N[2] = FuncNames2[2 * I];
         N[3] = FuncNames2[2 * I + 1];
         for (int J = 0; J < 4; J++) {
-          StringRef R = Symtab.getFuncName(IndexedInstrProf::ComputeHash(N[J]));
+          StringRef R =
+              Symtab.getFuncOrVarName(IndexedInstrProf::ComputeHash(N[J]));
           ASSERT_EQ(StringRef(N[J]), R);
         }
       }
diff --git a/llvm/unittests/Support/BlockFrequencyTest.cpp b/llvm/unittests/Support/BlockFrequencyTest.cpp
index cf5cf9213501c4..d0374b0a8c5194 100644
--- a/llvm/unittests/Support/BlockFrequencyTest.cpp
+++ b/llvm/unittests/Support/BlockFrequencyTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "gtest/gtest.h"
 #include <climits>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -106,12 +107,12 @@ TEST(BlockFrequencyTest, Saturate) {
   Freq /= BranchProbability(1, 2);
   EXPECT_EQ(Freq.getFrequency(), UINT64_MAX);
 
-  Freq = 0x1000000000000000ULL;
+  Freq = BlockFrequency(UINT64_C(0x1000000000000000));
   Freq /= BranchProbability(10000, 170000);
   EXPECT_EQ(Freq.getFrequency(), UINT64_MAX);
 
   // Try to cheat the multiplication overflow check.
-  Freq = 0x00000001f0000001ull;
+  Freq = BlockFrequency(UINT64_C(0x00000001f0000001));
   Freq /= BranchProbability(1000, 0xf000000f);
   EXPECT_EQ(33527736066704712ULL, Freq.getFrequency());
 }
diff --git a/llvm/unittests/Support/HashBuilderTest.cpp b/llvm/unittests/Support/HashBuilderTest.cpp
index 65d5146555d47d..7738897be4e00a 100644
--- a/llvm/unittests/Support/HashBuilderTest.cpp
+++ b/llvm/unittests/Support/HashBuilderTest.cpp
@@ -125,7 +125,7 @@ struct SimpleStruct {
 };
 
 template <typename HasherT, llvm::support::endianness Endianness>
-void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
              const SimpleStruct &Value) {
   HBuilder.add(Value.C);
   HBuilder.add(Value.I);
@@ -139,7 +139,7 @@ struct StructWithoutCopyOrMove {
   StructWithoutCopyOrMove &operator=(const StructWithoutCopyOrMove &) = delete;
 
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const StructWithoutCopyOrMove &Value) {
     HBuilder.add(Value.I);
   }
@@ -154,9 +154,9 @@ struct /* __attribute__((packed)) */ StructWithFastHash {
   // If possible, we want to hash both `I` and `C` in a single `update`
   // call for performance concerns.
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const StructWithFastHash &Value) {
-    if (Endianness == llvm::support::endian::system_endianness()) {
+    if (Endianness == llvm::endianness::native) {
       HBuilder.update(llvm::ArrayRef(reinterpret_cast<const uint8_t *>(&Value),
                                      sizeof(Value)));
     } else {
@@ -178,9 +178,9 @@ struct CustomContainer {
       Elements[I] = I;
   }
   template <typename HasherT, llvm::support::endianness Endianness>
-  friend void addHash(llvm::HashBuilderImpl<HasherT, Endianness> &HBuilder,
+  friend void addHash(llvm::HashBuilder<HasherT, Endianness> &HBuilder,
                       const CustomContainer &Value) {
-    if (Endianness == llvm::support::endian::system_endianness()) {
+    if (Endianness == llvm::endianness::native) {
       HBuilder.update(llvm::ArrayRef(
           reinterpret_cast<const uint8_t *>(&Value.Size),
           sizeof(Value.Size) + Value.Size * sizeof(Value.Elements[0])));
diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp
index e893e0d6f91685..90e26a23e87c20 100644
--- a/llvm/unittests/Support/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp
@@ -653,6 +653,7 @@ R"(All available -march extensions for RISC-V
     zihpm               2.0
     zmmul               1.0
     zawrs               1.0
+    zfa                 1.0
     zfh                 1.0
     zfhmin              1.0
     zfinx               1.0
@@ -729,7 +730,6 @@ Experimental extensions
     zicfilp             0.2       This is a long dummy description
     zicond              1.0
     zacas               1.0
-    zfa                 0.2
     zfbfmin             0.8
     ztso                0.1
     zvbb                1.0
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 40846f52b80b8f..dc68f945e09026 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -23,7 +23,6 @@ using llvm::yaml::Hex32;
 using llvm::yaml::Hex64;
 using llvm::yaml::Hex8;
 using llvm::yaml::Input;
-using llvm::yaml::IO;
 using llvm::yaml::isNumeric;
 using llvm::yaml::MappingNormalization;
 using llvm::yaml::MappingTraits;
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index 643e8972cd7505..d0e8977f1245d5 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -103,9 +103,12 @@ class FunctionSpecializationTest : public testing::Test {
     Cost CodeSize =
         TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
 
-    Cost Latency = SizeOnly ? 0 :
-        BFI.getBlockFreq(I.getParent()).getFrequency() / BFI.getEntryFreq() *
-        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
+    Cost Latency =
+        SizeOnly
+            ? 0
+            : BFI.getBlockFreq(I.getParent()).getFrequency() /
+                  BFI.getEntryFreq().getFrequency() *
+                  TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
 
     return {CodeSize, Latency};
   }
diff --git a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
index bdabc34decf854..cb3d1001e4110d 100644
--- a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
+++ b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
@@ -33,7 +33,6 @@ namespace {
 
 using testing::DoDefault;
 using testing::Return;
-using testing::Expectation;
 using testing::Invoke;
 using testing::InvokeWithoutArgs;
 using testing::_;
diff --git a/llvm/unittests/XRay/FDRBlockVerifierTest.cpp b/llvm/unittests/XRay/FDRBlockVerifierTest.cpp
index 5af27450291abc..132952d85366c9 100644
--- a/llvm/unittests/XRay/FDRBlockVerifierTest.cpp
+++ b/llvm/unittests/XRay/FDRBlockVerifierTest.cpp
@@ -17,8 +17,6 @@ namespace llvm {
 namespace xray {
 namespace {
 
-using ::testing::ElementsAre;
-using ::testing::Not;
 using ::testing::SizeIs;
 
 TEST(FDRBlockVerifierTest, ValidBlocksV3) {
diff --git a/llvm/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp b/llvm/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
index 6fb500149e386a..c1d991c8dbf1ce 100644
--- a/llvm/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
+++ b/llvm/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
@@ -49,10 +49,8 @@ using ::testing::IsEmpty;
 using ::testing::Matches;
 using ::testing::Pair;
 using ::testing::PrintToString;
-using ::testing::Property;
 using ::testing::SizeIs;
 using ::testing::UnorderedElementsAre;
-using ::testing::Value;
 
 namespace llvm {
 namespace cfi_verify {
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 1dbd821f20fdc5..2ea48904466af4 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -357,8 +357,8 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter {
   /// to the number of named operands that predicate expects. Store locations in
   /// StoreIdxForName correspond to the order in which operand names appear in
   /// predicate's argument list.
-  /// When we visit named leaf operand and WaitingForNamedOperands is not zero,
-  /// add matcher that will record operand and decrease counter.
+  /// When we visit named operand and WaitingForNamedOperands is not zero, add
+  /// matcher that will record operand and decrease counter.
   unsigned WaitingForNamedOperands = 0;
   StringMap<unsigned> StoreIdxForName;
 
@@ -997,6 +997,17 @@ Error GlobalISelEmitter::importChildMatcher(
                           to_string(*SrcChild) + ")");
   }
 
+  // Try look up SrcChild for a (named) predicate operand if there is any.
+  if (WaitingForNamedOperands) {
+    auto &ScopedNames = SrcChild->getNamesAsPredicateArg();
+    if (!ScopedNames.empty()) {
+      auto PA = ScopedNames.begin();
+      std::string Name = getScopedName(PA->getScope(), PA->getIdentifier());
+      OM.addPredicate<RecordNamedOperandMatcher>(StoreIdxForName[Name], Name);
+      --WaitingForNamedOperands;
+    }
+  }
+
   // Check for nested instructions.
   if (!SrcChild->isLeaf()) {
     if (SrcChild->getOperator()->isSubClassOf("ComplexPattern")) {
@@ -1061,13 +1072,6 @@ Error GlobalISelEmitter::importChildMatcher(
   if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild->getLeafValue())) {
     auto *ChildRec = ChildDefInit->getDef();
 
-    if (WaitingForNamedOperands) {
-      auto PA = SrcChild->getNamesAsPredicateArg().begin();
-      std::string Name = getScopedName(PA->getScope(), PA->getIdentifier());
-      OM.addPredicate<RecordNamedOperandMatcher>(StoreIdxForName[Name], Name);
-      --WaitingForNamedOperands;
-    }
-
     // Check for register classes.
     if (ChildRec->isSubClassOf("RegisterClass") ||
         ChildRec->isSubClassOf("RegisterOperand")) {
diff --git a/llvm/utils/gn/secondary/libunwind/src/BUILD.gn b/llvm/utils/gn/secondary/libunwind/src/BUILD.gn
index dd6bd577108766..ea95b1d64fdde8 100644
--- a/llvm/utils/gn/secondary/libunwind/src/BUILD.gn
+++ b/llvm/utils/gn/secondary/libunwind/src/BUILD.gn
@@ -33,6 +33,7 @@ unwind_sources = [
   "Unwind-EHABI.h",
   "Unwind-seh.cpp",
   "Unwind-sjlj.c",
+  "Unwind-wasm.c",
   "UnwindCursor.hpp",
   "UnwindLevel1-gcc-ext.c",
   "UnwindLevel1.c",
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/SymbolFile/DWARF/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/SymbolFile/DWARF/BUILD.gn
index a3b8274cdde35c..ec99bf0e8ca26b 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/SymbolFile/DWARF/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/SymbolFile/DWARF/BUILD.gn
@@ -50,7 +50,6 @@ static_library("DWARF") {
     "DWARFContext.cpp",
     "DWARFDIE.cpp",
     "DWARFDataExtractor.cpp",
-    "DWARFDebugAbbrev.cpp",
     "DWARFDebugArangeSet.cpp",
     "DWARFDebugAranges.cpp",
     "DWARFDebugInfo.cpp",
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 528694b07f0330..a6314e35c1a045 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -55,7 +55,7 @@ def __init__(self, command, message):
 #
 # COMMAND that follows %dbg(ARG) is also captured. COMMAND can be
 # empty as a result of conditinal substitution.
-kPdbgRegex = "%dbg\\(([^)'\"]*)\\)(.*)"
+kPdbgRegex = "%dbg\\(([^)'\"]*)\\)((?:.|\\n)*)"
 
 
 def buildPdbgCommand(msg, cmd):
diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py
index 3cb47c605ad5ac..b2a578e0ae6d9a 100755
--- a/llvm/utils/lit/lit/main.py
+++ b/llvm/utils/lit/lit/main.py
@@ -73,7 +73,7 @@ def main(builtin_params={}):
                     "The test suite configuration requested an individual"
                     " test timeout of {0} seconds but a timeout of {1} seconds was"
                     " requested on the command line. Forcing timeout to be {1}"
-                    " seconds"
+                    " seconds."
                 ).format(lit_config.maxIndividualTestTime, opts.maxIndividualTestTime)
             )
             lit_config.maxIndividualTestTime = opts.maxIndividualTestTime
diff --git a/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg
index b3a86e73d21ac9..6454e22ce2145f 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg
+++ b/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg
@@ -1,6 +1,7 @@
 import lit
 
-preamble_commands = ['echo "THIS WAS"', 'echo "INJECTED"']
+# Check multiple commands, and check newlines.
+preamble_commands = ['echo "THIS WAS"', 'echo\n"INJECTED"']
 
 config.name = "shtest-inject"
 config.suffixes = [".txt"]
diff --git a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/lit.local.cfg b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/lit.local.cfg
index 4cc234df4fcaaf..913495fbb3fe33 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/lit.local.cfg
+++ b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/lit.local.cfg
@@ -1,3 +1,8 @@
 import lit.formats
 
 config.test_format = lit.formats.ShTest(execute_external=True)
+config.substitutions.append(("%{cmds-with-newlines}", """
+echo abc |
+FileCheck %s &&
+false
+"""))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/run-line-with-newline.txt b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/run-line-with-newline.txt
new file mode 100644
index 00000000000000..a9cf51be9cb9a4
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/run-line-with-newline.txt
@@ -0,0 +1,2 @@
+# RUN: %{cmds-with-newlines}
+# CHECK: abc
diff --git a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/lit.local.cfg b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/lit.local.cfg
index 3002a11ed23c5e..6b8e6244c1ae2a 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/lit.local.cfg
+++ b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/lit.local.cfg
@@ -1,3 +1,8 @@
 import lit.formats
 
 config.test_format = lit.formats.ShTest(execute_external=False)
+config.substitutions.append(("%{cmds-with-newlines}", """
+echo abc |
+FileCheck %s &&
+false
+"""))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/run-line-with-newline.txt b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/run-line-with-newline.txt
new file mode 100644
index 00000000000000..a9cf51be9cb9a4
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/run-line-with-newline.txt
@@ -0,0 +1,2 @@
+# RUN: %{cmds-with-newlines}
+# CHECK: abc
diff --git a/llvm/utils/lit/tests/shtest-inject.py b/llvm/utils/lit/tests/shtest-inject.py
index 3d34eb7161d462..5ba9d2f81268f3 100644
--- a/llvm/utils/lit/tests/shtest-inject.py
+++ b/llvm/utils/lit/tests/shtest-inject.py
@@ -14,7 +14,8 @@
 #  CHECK-TEST1-NEXT: # | THIS WAS
 #  CHECK-TEST1-NEXT: # `---{{-*}}
 #  CHECK-TEST1-NEXT: # preamble command line
-#  CHECK-TEST1-NEXT: echo "INJECTED"
+#  CHECK-TEST1-NEXT: echo
+#  CHECK-TEST1-NEXT: "INJECTED"
 #  CHECK-TEST1-NEXT: # executed command: echo INJECTED
 #  CHECK-TEST1-NEXT: # .---command stdout{{-*}}
 #  CHECK-TEST1-NEXT: # | INJECTED
diff --git a/llvm/utils/lit/tests/shtest-run-at-line.py b/llvm/utils/lit/tests/shtest-run-at-line.py
index 18086f6fa10d65..dc0b78c6b81cad 100644
--- a/llvm/utils/lit/tests/shtest-run-at-line.py
+++ b/llvm/utils/lit/tests/shtest-run-at-line.py
@@ -7,7 +7,7 @@
 # END.
 
 
-# CHECK: Testing: 6 tests
+# CHECK: Testing: 8 tests
 
 
 # In the case of the external shell, we check for only RUN lines in stderr in
@@ -48,6 +48,15 @@
 #   CHECK-NOT: RUN
 #       CHECK: --
 
+# CHECK-LABEL: FAIL: shtest-run-at-line :: external-shell/run-line-with-newline.txt
+
+#      CHECK: Command Output (stderr)
+# CHECK-NEXT: --
+# CHECK-NEXT: {{^}}RUN: at line 1: echo abc |
+# CHECK-NEXT: FileCheck {{.*}} &&
+# CHECK-NEXT: false
+#  CHECK-NOT: RUN
+
 
 # CHECK-LABEL: FAIL: shtest-run-at-line :: internal-shell/basic.txt
 
@@ -87,3 +96,16 @@
 # CHECK-NEXT: # executed command: echo 'foo baz'
 # CHECK-NEXT: # executed command: FileCheck {{.*}}
 # CHECK-NOT:  RUN
+
+# CHECK-LABEL: FAIL: shtest-run-at-line :: internal-shell/run-line-with-newline.txt
+
+#      CHECK: Command Output (stdout)
+# CHECK-NEXT: --
+# CHECK-NEXT: # RUN: at line 1
+# CHECK-NEXT: echo abc |
+# CHECK-NEXT: FileCheck {{.*}} &&
+# CHECK-NEXT: false
+# CHECK-NEXT: # executed command: echo abc
+# CHECK-NEXT: # executed command: FileCheck {{.*}}
+# CHECK-NEXT: # executed command: false
+#  CHECK-NOT: RUN
diff --git a/mlir/docs/Dialects/ArmSME.md b/mlir/docs/Dialects/ArmSME.md
new file mode 100644
index 00000000000000..ab7c9ffe7aa92f
--- /dev/null
+++ b/mlir/docs/Dialects/ArmSME.md
@@ -0,0 +1,16 @@
+# 'ArmSME' Dialect
+
+Basic dialect to target Arm SME architectures This dialect contains the
+definitions necessary to target Arm SME scalable matrix operations.
+
+## References
+* https://developer.arm.com/documentation/ddi0616
+* https://developer.arm.com/documentation/ddi0602/2023-03/SME-Instructions
+
+## Operations
+
+[include "Dialects/ArmSMEOps.md"]
+
+## Operations for LLVM IR Intrinsics
+
+[include "Dialects/ArmSMEIntrinsicOps.md"]
diff --git a/mlir/docs/Dialects/MemRef.md b/mlir/docs/Dialects/MemRef.md
index 2e2adaf7afa954..1d926fc3852396 100644
--- a/mlir/docs/Dialects/MemRef.md
+++ b/mlir/docs/Dialects/MemRef.md
@@ -11,66 +11,3 @@ before adding or changing any operation in this dialect.**
 
 [include "Dialects/MemRefOps.md"]
 
-### 'dma_start' operation
-
-Syntax:
-
-```
-operation ::= `memref.dma_start` ssa-use`[`ssa-use-list`]` `,`
-               ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
-               ssa-use`[`ssa-use-list`]` (`,` ssa-use `,` ssa-use)?
-              `:` memref-type `,` memref-type `,` memref-type
-```
-
-Starts a non-blocking DMA operation that transfers data from a source memref to
-a destination memref. The operands include the source and destination memref's
-each followed by its indices, size of the data transfer in terms of the number
-of elements (of the elemental type of the memref), a tag memref with its
-indices, and optionally two additional arguments corresponding to the stride (in
-terms of number of elements) and the number of elements to transfer per stride.
-The tag location is used by a dma_wait operation to check for completion. The
-indices of the source memref, destination memref, and the tag memref have the
-same restrictions as any load/store operation in an affine context (whenever DMA
-operations appear in an affine context). See
-[restrictions on dimensions and symbols](Affine.md/#restrictions-on-dimensions-and-symbols)
-in affine contexts. This allows powerful static analysis and transformations in
-the presence of such DMAs including rescheduling, pipelining / overlap with
-computation, and checking for matching start/end operations. The source and
-destination memref need not be of the same dimensionality, but need to have the
-same elemental type.
-
-For example, a `memref.dma_start` operation that transfers 32 vector elements
-from a memref `%src` at location `[%i, %j]` to memref `%dst` at `[%k, %l]` would
-be specified as shown below.
-
-Example:
-
-```mlir
-%size = arith.constant 32 : index
-%tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
-%idx = arith.constant 0 : index
-memref.dma_start %src[%i, %j], %dst[%k, %l], %size, %tag[%idx] :
-     memref<40 x 8 x vector<16xf32>, affine_map<(d0, d1) -> (d0, d1)>, 0>,
-     memref<2 x 4 x vector<16xf32>, affine_map<(d0, d1) -> (d0, d1)>, 2>,
-     memref<1 x i32>, affine_map<(d0) -> (d0)>, 4>
-```
-
-### 'dma_wait' operation
-
-Syntax:
-
-```
-operation ::= `memref.dma_wait` ssa-use`[`ssa-use-list`]` `,` ssa-use `:` memref-type
-```
-
-Blocks until the completion of a DMA operation associated with the tag element
-specified with a tag memref and its indices. The operands include the tag memref
-followed by its indices and the number of elements associated with the DMA being
-waited on. The indices of the tag memref have the same restrictions as
-load/store indices.
-
-Example:
-
-```mlir
-memref.dma_wait %tag[%idx], %size : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
-```
diff --git a/mlir/docs/Dialects/Transform.md b/mlir/docs/Dialects/Transform.md
index 0a59d4b32c7eb6..a24099c5ba9c9c 100644
--- a/mlir/docs/Dialects/Transform.md
+++ b/mlir/docs/Dialects/Transform.md
@@ -423,6 +423,10 @@ ops rather than having the methods directly act on the payload IR.
 
 [include "Dialects/BufferizationTransformOps.md"]
 
+## Func Transform Operations
+
+[include "Dialects/FuncTransformOps.md"]
+
 ## GPU Transform Operations
 
 [include "Dialects/GPUTransformOps.md"]
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index 9a7cfd1f9bebc3..95a38207b7f854 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -458,7 +458,7 @@ program has been run through the passes. This provides several benefits:
 In some situations it may be useful to run a pass pipeline within another pass,
 to allow configuring or filtering based on some invariants of the current
 operation being operated on. For example, the
-[Inliner Pass](Passes.md/#-inline-inline-function-calls) may want to run
+[Inliner Pass](Passes.md/#-inline) may want to run
 intraprocedural simplification passes while it is inlining to produce a better
 cost model, and provide more optimal inlining. To enable this, passes may run an
 arbitrary `OpPassManager` on the current operation being operated on or any
diff --git a/mlir/docs/Rationale/RationaleLinalgDialect.md b/mlir/docs/Rationale/RationaleLinalgDialect.md
index 237a8e3b6bda6c..7b5137ede3ae74 100644
--- a/mlir/docs/Rationale/RationaleLinalgDialect.md
+++ b/mlir/docs/Rationale/RationaleLinalgDialect.md
@@ -384,7 +384,7 @@ Affine dialects in particular, Linalg takes the following decisions.
   multi-for loops with induction variables independent of each other (referred
   to as hyper-rectangular iteration domains in the literature) such as the
   proposed
-  [affine.parallel]((https://llvm.discourse.group/t/rfc-add-affine-parallel/350)
+  [affine.parallel](https://llvm.discourse.group/t/rfc-add-affine-parallel/350)
   are sufficient in the majority of cases.
 - **Declarative Tiling**: the *tiling* transformation is ubiquitous in HPC code
   generation. It can be seen as a decomposition of either the iteration space or
diff --git a/mlir/docs/Traits/_index.md b/mlir/docs/Traits/_index.md
index 6a9c650aca96b7..3fa24ec77107f0 100644
--- a/mlir/docs/Traits/_index.md
+++ b/mlir/docs/Traits/_index.md
@@ -40,7 +40,7 @@ Operation traits may also provide a `verifyTrait` or `verifyRegionTrait` hook
 that is called when verifying the concrete operation. The difference between
 these two is that whether the verifier needs to access the regions, if so, the
 operations in the regions will be verified before the verification of this
-trait. The [verification order](DefiningDialects/Operations.md/#verification-ordering)
+trait. The [verification order](../DefiningDialects/Operations.md/#verification-ordering)
 determines when a verifier will be invoked.
 
 ```c++
@@ -155,7 +155,7 @@ class MyType : public Type::TypeBase<MyType, ..., MyTrait, MyParametricTrait<10>
 
 ### Attaching Operation Traits in ODS
 
-To use an operation trait in the [ODS](DefiningDialects/Operations.md) framework, we need to
+To use an operation trait in the [ODS](../DefiningDialects/Operations.md) framework, we need to
 provide a definition of the trait class. This can be done using the
 `NativeOpTrait` and `ParamNativeOpTrait` classes. `ParamNativeOpTrait` provides
 a mechanism in which to specify arguments to a parametric trait class with an
@@ -177,7 +177,7 @@ These can then be used in the `traits` list of an op definition:
 def OpWithInferTypeInterfaceOp : Op<...[MyTrait, MyParametricTrait<10>]> { ... }
 ```
 
-See the documentation on [operation definitions](DefiningDialects/Operations.md) for more
+See the documentation on [operation definitions](../DefiningDialects/Operations.md) for more
 details.
 
 ## Using a Trait
@@ -186,7 +186,7 @@ Traits may be used to provide additional methods, static fields, or other
 information directly on the concrete object. `Traits` internally become `Base`
 classes of the concrete operation, so all of these are directly accessible. To
 expose this information opaquely to transformations and analyses,
-[`interfaces`](Interfaces.md) may be used.
+[`interfaces`](../Interfaces.md) may be used.
 
 To query if a specific object contains a specific trait, the `hasTrait<>` method
 may be used. This takes as a template parameter the trait class, which is the
@@ -231,7 +231,7 @@ This trait is carried by region holding operations that define a new scope for
 automatic allocation. Such allocations are automatically freed when control is
 transferred back from the regions of such operations. As an example, allocations
 performed by
-[`memref.alloca`](Dialects/MemRef.md/#memrefalloca-mlirmemrefallocaop) are
+[`memref.alloca`](../Dialects/MemRef.md/#memrefalloca-memrefallocaop) are
 automatically freed when control leaves the region of its closest surrounding op
 that has the trait AutomaticAllocationScope.
 
@@ -241,7 +241,8 @@ that has the trait AutomaticAllocationScope.
 
 This trait adds the property that the operation is known to have
 [broadcast-compatible](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-operands and that its result type is compatible with the inferred broadcast shape. See [The `Broadcastable` Trait](Traits/Broadcastable.md) for details.
+operands and that its result type is compatible with the inferred broadcast shape. 
+See [The `Broadcastable` Trait](Broadcastable.md) for details.
 
 ### Commutative
 
@@ -290,7 +291,7 @@ foo.region_op {
 ```
 
 This trait is an important structural property of the IR, and enables operations
-to have [passes](PassManagement.md) scheduled under them.
+to have [passes](../PassManagement) scheduled under them.
 
 ### MemRefsNormalizable
 
@@ -302,8 +303,7 @@ type where those references can be 'normalized'. In cases where an associated
 operations can be modified so that the `MemRef` has an identity layout
 specification. This can be implemented by associating the operation with its own
 index expression that can express the equivalent of the memory-layout
-specification of the MemRef type. See [the -normalize-memrefs pass].
-(https://mlir.llvm.org/docs/Passes/#-normalize-memrefs-normalize-memrefs)
+specification of the MemRef type. See [the -normalize-memrefs pass](../Passes.md/#-normalize-memrefs).
 
 ### Single Block Region
 
@@ -325,18 +325,18 @@ that the single block must terminate with `TerminatorOpType`.
 *   `OpTrait::SymbolTable` -- `SymbolTable`
 
 This trait is used for operations that define a
-[`SymbolTable`](SymbolsAndSymbolTables.md#symbol-table).
+[`SymbolTable`](../SymbolsAndSymbolTables.md/#symbol-table).
 
 ### Terminator
 
 *   `OpTrait::IsTerminator` -- `Terminator`
 
 This trait provides verification and functionality for operations that are known
-to be [terminators](LangRef.md#terminator-operations).
+to be [terminators](../LangRef.md/#control-flow-and-ssacfg-regions).
 
 *   `OpTrait::NoTerminator` -- `NoTerminator`
 
 This trait removes the requirement on regions held by an operation to have
-[terminator operations](LangRef.md#terminator-operations) at the end of a block.
+[terminator operations](../LangRef.md/#control-flow-and-ssacfg-regions) at the end of a block.
 This requires that these regions have a single block. An example of operation
 using this trait is the top-level `ModuleOp`.
diff --git a/mlir/docs/Tutorials/transform/ChH.md b/mlir/docs/Tutorials/transform/ChH.md
index 0696853dda1cf7..7c12728ac32466 100644
--- a/mlir/docs/Tutorials/transform/ChH.md
+++ b/mlir/docs/Tutorials/transform/ChH.md
@@ -497,6 +497,20 @@ bufferization is directly available as a transform operation.
   function_boundary_type_conversion = 1 : i32 }
 ```
 
+One-shot bufferization itself does not produce buffer deallocations, which may
+lead to leaks. So we have to run the buffer deallocation pass pipeline to avoid
+them. Note that the transform dialect seamlessly runs named passes and pass
+pipelines: if desired, one could replace complex `--pass-pipeline expressions`
+with operations. Note that we apply the pipeline to functions rather than entire
+module to avoid running it on the transform IR that is contained in the module.
+
+```mlir
+%f = transform.structured.match ops{["func.func"]} in %arg1
+  : (!transform.any_op) -> !transform.any_op
+transform.apply_registered_pass "buffer-deallocation-pipeline" to %f
+  : (!transform.any_op) -> !transform.any_op
+```
+
 In this particular case, the transformed IR could be directly bufferized. This
 is not always the case in general as some operations, in particular
 `tensor.empty` may not be bufferizable. Such operations need to be removed
diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h
index fecbeaf6b0f9d6..7e47e54e7361d5 100644
--- a/mlir/include/mlir-c/Dialect/SparseTensor.h
+++ b/mlir/include/mlir-c/Dialect/SparseTensor.h
@@ -26,20 +26,20 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SparseTensor, sparse_tensor);
 /// If updating, keep them in sync and update the static_assert in the impl
 /// file.
 enum MlirSparseTensorDimLevelType {
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_DENSE = 4,                     // 0b00001_00
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED = 8,                // 0b00010_00
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_NU = 9,             // 0b00010_01
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_NO = 10,            // 0b00010_10
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_NU_NO = 11,         // 0b00010_11
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON = 16,                // 0b00100_00
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NU = 17,             // 0b00100_01
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NO = 18,             // 0b00100_10
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NU_NO = 19,          // 0b00100_11
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI = 32,       // 0b01000_00
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI_NU = 33,    // 0b01000_01
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI_NO = 34,    // 0b01000_10
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI_NU_NO = 35, // 0b01000_11
-  MLIR_SPARSE_TENSOR_DIM_LEVEL_TWO_OUT_OF_FOUR = 64,          // 0b10000_00
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_DENSE = 4,                   // 0b00001_00
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED = 8,              // 0b00010_00
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_NU = 9,           // 0b00010_01
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_NO = 10,          // 0b00010_10
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_NU_NO = 11,       // 0b00010_11
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON = 16,              // 0b00100_00
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NU = 17,           // 0b00100_01
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NO = 18,           // 0b00100_10
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NU_NO = 19,        // 0b00100_11
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED = 32,       // 0b01000_00
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED_NU = 33,    // 0b01000_01
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED_NO = 34,    // 0b01000_10
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED_NU_NO = 35, // 0b01000_11
+  MLIR_SPARSE_TENSOR_DIM_LEVEL_TWO_OUT_OF_FOUR = 64,        // 0b10000_00
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h
index a6408317db69e6..e361f33a0d8364 100644
--- a/mlir/include/mlir-c/IR.h
+++ b/mlir/include/mlir-c/IR.h
@@ -576,6 +576,10 @@ MLIR_CAPI_EXPORTED intptr_t mlirOperationGetNumSuccessors(MlirOperation op);
 MLIR_CAPI_EXPORTED MlirBlock mlirOperationGetSuccessor(MlirOperation op,
                                                        intptr_t pos);
 
+/// Set `pos`-th successor of the operation.
+MLIR_CAPI_EXPORTED void
+mlirOperationSetSuccessor(MlirOperation op, intptr_t pos, MlirBlock block);
+
 /// Returns true if this operation defines an inherent attribute with this name.
 /// Note: the attribute can be optional, so
 /// `mlirOperationGetInherentAttributeByName` can still return a null attribute.
diff --git a/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h b/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h
index 142f77c976ffc3..21bd191aa9dc8c 100644
--- a/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h
+++ b/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h
@@ -19,6 +19,7 @@ namespace mlir {
 class DialectRegistry;
 class LLVMTypeConverter;
 class RewritePatternSet;
+class SymbolTable;
 
 /// Collect the default pattern to convert a FuncOp to the LLVM dialect. If
 /// `emitCWrappers` is set, the pattern will also produce functions
@@ -31,8 +32,18 @@ void populateFuncToLLVMFuncOpConversionPattern(LLVMTypeConverter &converter,
 /// conversion patterns capture the LLVMTypeConverter and the LowerToLLVMOptions
 /// by reference meaning the references have to remain alive during the entire
 /// pattern lifetime.
-void populateFuncToLLVMConversionPatterns(LLVMTypeConverter &converter,
-                                          RewritePatternSet &patterns);
+///
+/// The `symbolTable` parameter can be used to speed up function lookups in the
+/// module. It's good to provide it, but only if we know that the patterns will
+/// be applied to a single module and the symbols referenced by the symbol table
+/// will not be removed and new symbols will not be added during the usage of
+/// the patterns. If provided, the lookups will have O(calls) cumulative
+/// runtime, otherwise O(calls * functions). The symbol table is currently not
+/// needed if `converter.getOptions().useBarePtrCallConv` is `true`, but it's
+/// not an error to provide it anyway.
+void populateFuncToLLVMConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    const SymbolTable *symbolTable = nullptr);
 
 void registerConvertFuncToLLVMInterface(DialectRegistry &registry);
 
diff --git a/mlir/include/mlir/Debug/CLOptionsSetup.h b/mlir/include/mlir/Debug/CLOptionsSetup.h
index ab8e7b436a1e9f..a30ffe0ff1af48 100644
--- a/mlir/include/mlir/Debug/CLOptionsSetup.h
+++ b/mlir/include/mlir/Debug/CLOptionsSetup.h
@@ -51,6 +51,9 @@ class DebugConfig {
   /// Get the filename to use for logging actions.
   StringRef getLogActionsTo() const { return logActionsToFlag; }
 
+  /// Get the filename to use for profiling actions.
+  StringRef getProfileActionsTo() const { return profileActionsToFlag; }
+
   /// Set a location breakpoint manager to filter out action logging based on
   /// the attached IR location in the Action context. Ownership stays with the
   /// caller.
@@ -71,6 +74,9 @@ class DebugConfig {
   /// Log action execution to the given file (or "-" for stdout)
   std::string logActionsToFlag;
 
+  /// Profile action execution to the given file (or "-" for stdout)
+  std::string profileActionsToFlag;
+
   /// Location Breakpoints to filter the action logging.
   std::vector<tracing::BreakpointManager *> logActionLocationFilter;
 };
diff --git a/mlir/include/mlir/Debug/Observers/ActionProfiler.h b/mlir/include/mlir/Debug/Observers/ActionProfiler.h
new file mode 100644
index 00000000000000..50ee4f0baf2b09
--- /dev/null
+++ b/mlir/include/mlir/Debug/Observers/ActionProfiler.h
@@ -0,0 +1,51 @@
+//===- ActionProfiler.h -  Profiling Actions *- C++ -*-=======================//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TRACING_OBSERVERS_ACTIONPROFILER_H
+#define MLIR_TRACING_OBSERVERS_ACTIONPROFILER_H
+
+#include "mlir/Debug/ExecutionContext.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <chrono>
+#include <mutex>
+
+namespace mlir {
+namespace tracing {
+
+/// This class defines an observer that profiles events before and after
+/// execution on the provided stream. The events are stored in the Chrome trace
+/// event format.
+struct ActionProfiler : public ExecutionContext::Observer {
+  ActionProfiler(raw_ostream &os)
+      : os(os), startTime(std::chrono::steady_clock::now()) {
+    os << "[";
+  }
+
+  ~ActionProfiler() override { os << "]"; }
+
+  void beforeExecute(const ActionActiveStack *action, Breakpoint *breakpoint,
+                     bool willExecute) override;
+  void afterExecute(const ActionActiveStack *action) override;
+
+private:
+  void print(const ActionActiveStack *action, llvm::StringRef phase);
+
+  raw_ostream &os;
+  std::chrono::time_point<std::chrono::steady_clock> startTime;
+  bool printComma = false;
+
+  /// A mutex used to guard profiling.
+  std::mutex mutex;
+};
+
+} // namespace tracing
+} // namespace mlir
+
+#endif // MLIR_TRACING_OBSERVERS_ACTIONPROFILER_H
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 891a6661ca87be..1036e93a039240 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -349,7 +349,8 @@ def AffineVectorize : Pass<"affine-super-vectorize", "func::FuncOp"> {
   let dependentDialects = ["vector::VectorDialect"];
   let options = [
     ListOption<"vectorSizes", "virtual-vector-size", "int64_t",
-               "Specify an n-D virtual vector size for vectorization">,
+               "Specify an n-D virtual vector size for vectorization. "
+               "This must be greater than zero.">,
     // Optionally, the fixed mapping from loop to fastest varying MemRef
     // dimension for all the MemRefs within a loop pattern:
     //   the index represents the loop depth, the value represents the k^th
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
index 2c2353bdc81027..cbc6147cb81e22 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -20,15 +20,10 @@ namespace arith {
 
 #define GEN_PASS_DECL
 #include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
-#define GEN_PASS_DECL_ARITHINTRANGEOPTS
-#include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
 
 class WideIntEmulationConverter;
 class NarrowTypeEmulationConverter;
 
-/// Create a pass to bufferize Arith ops.
-std::unique_ptr<Pass> createArithBufferizePass();
-
 /// Create a pass to bufferize arith.constant ops.
 std::unique_ptr<Pass> createConstantBufferizePass(uint64_t alignment = 0);
 
@@ -68,13 +63,6 @@ void populateExpandBFloat16Patterns(RewritePatternSet &patterns);
 /// Add patterns to expand Arith ops.
 void populateArithExpandOpsPatterns(RewritePatternSet &patterns);
 
-/// Create a pass to legalize Arith ops.
-std::unique_ptr<Pass> createArithExpandOpsPass();
-
-/// Create a pass to legalize Arith ops with specified configuration.
-std::unique_ptr<Pass>
-createArithExpandOpsPass(const ArithExpandOpsOptions &options);
-
 /// Create a pass to replace signed ops with unsigned ones where they are proven
 /// equivalent.
 std::unique_ptr<Pass> createArithUnsignedWhenEquivalentPass();
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
index 77575b0b5a57df..4096e309199e98 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -11,7 +11,7 @@
 
 include "mlir/Pass/PassBase.td"
 
-def ArithBufferize : Pass<"arith-bufferize", "ModuleOp"> {
+def ArithBufferizePass : Pass<"arith-bufferize", "ModuleOp"> {
   let summary = "Bufferize Arith dialect ops.";
   let description = [{
     This pass bufferizes arith dialect ops.
@@ -21,16 +21,14 @@ def ArithBufferize : Pass<"arith-bufferize", "ModuleOp"> {
     multi-threading. Most other bufferization passes can run in parallel at
     function granularity.
   }];
-  let constructor = "mlir::arith::createArithBufferizePass()";
   let options = [
     Option<"alignment", "alignment", "unsigned", /*default=*/"0",
            "Create global memrefs with a specified alignment">,
   ];
 }
 
-def ArithExpandOps : Pass<"arith-expand"> {
+def ArithExpandOpsPass : Pass<"arith-expand"> {
   let summary = "Legalize Arith ops to be convertible to LLVM.";
-  let constructor = "mlir::arith::createArithExpandOpsPass()";
   let dependentDialects = ["vector::VectorDialect"];
   let options = [
     Option<"includeBf16", "include-bf16", "bool", /*default=*/"false",
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.h b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.h
index f947fc8fe1631b..b27ceca215dad4 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.h
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.h
@@ -29,6 +29,9 @@
 #include "mlir/Dialect/ArmSME/IR/ArmSMEDialect.h.inc"
 
 #define GET_OP_CLASSES
-#include "mlir/Dialect/ArmSME/IR/ArmSME.h.inc"
+#include "mlir/Dialect/ArmSME/IR/ArmSMEOps.h.inc"
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.h.inc"
 
 #endif // MLIR_DIALECT_ARMSME_IR_ARMSME_H
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.td
index 66a432ea1b171e..18b9bd7a107feb 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSME.td
@@ -1,4 +1,4 @@
-//===-- ArmSME.td - ArmSME dialect operation definitions ---*- tablegen -*-===//
+//===-- ArmSME.td - ArmSME dialect definitions ------------*- tablegen -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,22 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the ArmSME dialect and contains intrinsic ops to lower to
-// LLVM IR.
+// This file contains the definition of the ArmSME dialect as well as some
+// shared definitions.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMSME_OPS
-#define ARMSME_OPS
+#ifndef ARMSME
+#define ARMSME
 
-include "mlir/IR/EnumAttr.td"
-include "mlir/IR/OpBase.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/DialectBase.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
-include "mlir/Interfaces/InferTypeOpInterface.td"
 
 //===----------------------------------------------------------------------===//
-// ArmSME dialect definition
+// ArmSME Dialect
 //===----------------------------------------------------------------------===//
 
 def ArmSME_Dialect : Dialect {
@@ -45,616 +42,11 @@ def ArmSME_Dialect : Dialect {
 // ArmSME type definitions
 //===----------------------------------------------------------------------===//
 
-class SMETileType<Type datatype, list<int> dims, string description>
-  : ShapedContainerType<[datatype],
-      And<[IsVectorOfRankPred<[2]>, allDimsScalableVectorTypePred,
-           IsVectorOfShape<dims>]>,
-  description>;
-
-def nxnxv16i8  : SMETileType<I8,   [16, 16], "vector<[16]x[16]xi8>">;
-def nxnxv8i16  : SMETileType<I16,  [8,  8 ], "vector<[8]x[8]xi16>">;
-def nxnxv4i32  : SMETileType<I32,  [4,  4 ], "vector<[4]x[4]xi32>">;
-def nxnxv2i64  : SMETileType<I64,  [2,  2 ], "vector<[2]x[2]xi64>">;
-def nxnxv1i128 : SMETileType<I128, [1,  1 ], "vector<[1]x[1]xi128>">;
-
-def nxnxv8f16  : SMETileType<F16,  [8,  8 ], "vector<[8]x[8]xf16>">;
-def nxnxv8bf16 : SMETileType<BF16, [8,  8 ], "vector<[8]x[8]xbf16>">;
-def nxnxv4f32  : SMETileType<F32,  [4,  4 ], "vector<[4]x[4]xf32>">;
-def nxnxv2f64  : SMETileType<F64,  [2,  2 ], "vector<[2]x[2]xf64>">;
-
-def SMETile : AnyTypeOf<[nxnxv16i8, nxnxv8i16, nxnxv4i32, nxnxv2i64, nxnxv1i128,
-                         nxnxv8f16, nxnxv8bf16, nxnxv4f32, nxnxv2f64]>;
-
 def SVEVector : ScalableVectorOfRankAndLengthAndType<
   [1], [16, 8, 4, 2, 1], [I8, I16, I32, I64, I128, F16, BF16, F32, F64]>;
 
 def SVEPredicate : ScalableVectorOfRankAndLengthAndType<
   [1], [16, 8, 4, 2, 1], [I1]>;
 
-// A type constraint that verifies the bitwidth of the scalar integer returned
-// from 'arm_sme.get_tile_id' matches the element bitwidth of a "virtual tile".
-def TileElementWidthMatchesTileID : TypesMatchWith<
-  "`tile_id` has the same number of bits as elements in `vector`",
-  "vector", "tile_id",
-  "IntegerType::get("
-      "$_self.getContext(),"
-      "::llvm::isa<IntegerType>(::llvm::cast<VectorType>($_self).getElementType())"
-          "? ::llvm::cast<IntegerType>("
-                  "::llvm::cast<VectorType>($_self).getElementType())"
-                  ".getWidth()"
-          ": ::llvm::cast<FloatType>("
-                  "::llvm::cast<VectorType>($_self).getElementType())"
-                  ".getWidth())">;
-
-//===----------------------------------------------------------------------===//
-// ArmSME attr definitions
-//===----------------------------------------------------------------------===//
-
-def TileSliceLayout : I32EnumAttr<"TileSliceLayout", "Layout of a tile slice", [
-  I32EnumAttrCase<"Horizontal", 0, "horizontal">,
-  I32EnumAttrCase<"Vertical", 1, "vertical">,
-]> {
-  let cppNamespace = "::mlir::arm_sme";
-  let genSpecializedAttr = 0;
-}
-
-/// An attribute that specifies the layout of a tile slice in a tile.
-def ArmSME_TileSliceLayoutAttr : EnumAttr<ArmSME_Dialect, TileSliceLayout,
-                                          "layout"> {
-  let assemblyFormat = "`<` $value `>`";
-}
-
-//===----------------------------------------------------------------------===//
-// ArmSME op definitions
-//===----------------------------------------------------------------------===//
-
-class ArmSME_Op<string mnemonic, list<Trait> traits = []> :
-  Op<ArmSME_Dialect, mnemonic, traits> {}
-
-def CastTileToVector : ArmSME_Op<"cast_tile_to_vector", [Pure, TileElementWidthMatchesTileID]> {
-  let summary = "Cast from tile id to 2-d scalable vector type";
-  let description = [{
-    A `cast_tile_to_vector` operation does a cast from a tile id to a 2-d
-    scalable vector type, which represents an SME "virtual tile". This would
-    normally be used when lowering operations that return "virtual tile" vector
-    types to model the output. This is required to preserve dataflow as SME
-    intrinsics have no return values.
-
-    Example:
-
-    Input:
-    ```mlir
-    %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
-    vector.store %tile, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
-    ```
-
-    After lowering `vector.load`:
-    ```mlir
-    %tile_id = arm_sme.get_tile_id : i32
-    scf.for %vnum = %c0 to %num_vectors step %c1 {
-      // ...
-      "arm_sme.intr.ld1w.horiz"(%pg, %ptr, %tile_id, %vnum) : (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
-    }
-    %tile = arm_sme.cast_tile_to_vector %tile_id : i32 to vector<[4]x[4]xi32>
-    vector.store %tile, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
-    ```
-
-    In the example above, the `vector.load` can't be replaced with an SME
-    intrinsic that has no outputs since it is used by the `vector.store`.
-    However, by inserting a `cast_tile_to_vector` op after the load intrinsics
-    the `vector.load` can be replaced. This enables "local" rewrites on
-    individual vector ops, rather than "global" rewrites that would have to
-    look at the vector op uses and also lower them.
-
-    Canonicalization will look through `arm_sme.cast_tile_to_vector` and fold
-    the cast away if it comes from a `arm_sme.cast_vector_to_tile`.
-  }];
-  let arguments = (ins AnyTypeOf<[I8, I16, I32, I64, I128]>:$tile_id);
-  let results = (outs SMETile:$vector);
-  let assemblyFormat =
-    "$tile_id attr-dict `:` type($tile_id) `to` type($vector)";
-  let hasCanonicalizeMethod = 1;
-}
-
-def CastVectorToTile : ArmSME_Op<"cast_vector_to_tile", [Pure, TileElementWidthMatchesTileID]> {
-  let summary = "Cast from 2-d scalable vector type to tile id";
-  let description = [{
-    A `cast_vector_to_tile` operation does a cast from a 2-d scalable vector
-    type, which represents an SME "virtual tile", to a tile id. This is
-    required to preserve dataflow as the SME intrinsics have no return values.
-
-    Example:
-
-    Input:
-    ```mlir
-    %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
-    vector.store %tile, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
-    ```
-
-    After lowering `vector.store`:
-    ```mlir
-    %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
-    scf.for %vnum = %c0 to %num_vectors step %c1 {
-      // ...
-      %tile_id = arm_sme.cast_vector_to_tile %tile : (vector<[4]x[4]xi32>) -> i32
-      "arm_sme.intr.st1w.horiz"(%pg, %ptr, %tile_id, %vnum) : (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
-    }
-    ```
-
-    Canonicalization will look through `arm_sme.cast_vector_to_tile` and fold
-    the cast away if it comes from a `arm_sme.cast_tile_to_vector`.
-  }];
-  let arguments = (ins SMETile:$vector);
-  let results = (outs AnyTypeOf<[I8, I16, I32, I64, I128]>:$tile_id);
-  let assemblyFormat =
-    "$vector attr-dict `:` type($vector) `to` type($tile_id)";
-  let hasCanonicalizeMethod = 1;
-}
-
-def GetTileID : ArmSME_Op<"get_tile_id"> {
-  let summary = "Returns an SME \"virtual tile\" id";
-  let description = [{
-    A `get_tile_id` operation returns a scalar integer representing an SME
-    "virtual tile" id. The bitwidth of the scalar indicates the element
-    bitwidth of the "virtual tile".
-
-    The scope of a tile id is a function and cannot be passed or returned from
-    functions.
-
-    Example:
-    ```mlir
-    // Allocate and return an 8-bit element "virtual tile" id
-    %za0_b = arm_sme.get_tile_id : i8
-    ```
-
-    Example:
-    ```
-    // Allocate and return two 16-bit element "virtual tile" ids
-    %za0_h = arm_sme.get_tile_id : i16
-    %za1_h = arm_sme.get_tile_id : i16
-    ```
-
-    Example:
-    ```
-    // Allocate and return an 128-bit element "virtual tile" id
-    %za0_q = arm_sme.get_tile_id : i128
-    ```
-  }];
-
-  let results = (outs AnyTypeOf<[I8, I16, I32, I64, I128]>:$tile_id);
-  let assemblyFormat = "attr-dict `:` type($tile_id)";
-}
-
-//
-// Tile reset.
-//
-
-def ZeroOp : ArmSME_Op<"zero", [Pure]> {
-  let summary = "Initialize the two-dimensional ZA array with 0s";
-  let results = (outs SMETile:$res);
-  let description = [{
-    Initialise ZA with 0. This operation is convenient wrapper for the SME
-    `zero` intrinsic and instruction. 
-
-    Example 1: Zero an 8-bit element ZA tile.
-
-    ```mlir
-    %0 = arm_sme.zero : vector<[16]x[16]xi8>
-    ```
-
-    Example 2: Zero a 64-bit element ZA tile.
-
-    ```mlir
-    %0 = arm_sme.zero : vector<[2]x[2]xi64>
-    ```
-  }];
-  let extraClassDeclaration = [{
-    VectorType getVectorType() {
-      return ::llvm::cast<VectorType>(getRes().getType());
-    }
-  }];
-  let assemblyFormat = "attr-dict `:` type($res)";
-}
-
-def TileLoadOp : ArmSME_Op<"tile_load"> {
-  let summary = "Tile load operation";
-  let description = [{
-    Loads a 2D SME "virtual tile" from memory defined by a base and indices,
-    with the shape defined by the 2D scalable vector type of the result tile.
-    An optional tile slice layout attribute specifies whether the slices of the
-    tile being loaded are horizontal (default) or vertical. The slice of memory
-    must be contiguous. The memref must be either rank 1 or rank 2 with dynamic
-    dimensions, since the operation is scalable, and the element type must be a
-    scalar that matches the element type of the result.
-
-    Example 1: Load an 8-bit element ZA tile with horizontal layout (default) from memory (ZA0.B).
-    ```mlir
-    %tile = arm_sme.tile_load %base[%c0, %c0] : memref<?x?xi8>, vector<[16]x[16]xi8>
-    ```
-
-    Example 2: Load a FP 32-bit element ZA tile with vertical layout from memory.
-    ```mlir
-    %tile = arm_sme.tile_load %base[%c0, %c0], <vertical> : memref<?x?xf32>, vector<[4]x[4]xf32>
-    ```
-
-    Example 3: Load a 128-bit element ZA tile with horizontal layout (default) from memory.
-    ```mlir
-    %tile = arm_sme.tile_load %base[%c0, %c0], <horizontal> : memref<?x?xi128>, vector<[1]x[1]xi128>
-    ```
-  }];
-  let arguments = (ins
-    Arg<AnyMemRef, "the reference to load from", [MemRead]>:$base,
-    Variadic<Index>:$indices,
-    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
-                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
-  );
-  let results = (outs SMETile:$result);
-
-  let extraClassDeclaration = [{
-    MemRefType getMemRefType() {
-      return ::llvm::cast<MemRefType>(getBase().getType());
-    }
-    VectorType getVectorType() {
-      return ::llvm::cast<VectorType>(getResult().getType());
-    }
-  }];
-
-  let assemblyFormat =
-    "$base `[` $indices `]` (`,` $layout^)? attr-dict "
-      "`:` type($base) `,` type($result)";
-}
-
-def TileStoreOp : ArmSME_Op<"tile_store"> {
-  let summary = "Tile store operation";
-  let description = [{
-    Stores a 2D SME "virtual tile" to memory defined by a base and indices,
-    with the shape defined by the 2D scalable vector type of the tile being
-    stored. An optional tile slice layout attribute specifies whether the
-    slices of the tile being stored are horizontal (default) or vertical. The
-    slice of memory must be contiguous. The memref must be either rank 1 or
-    rank 2 with dynamic dimensions, since the operation is scalable, and the
-    element type must be a scalar that matches the element type of the result.
-
-    Example 1: Store an 8-bit element ZA tile with horizontal (default) layout to memory (ZA0.B).
-    ```mlir
-    arm_sme.tile_store %tile, %base[%c0, %c0] : vector<[16]x[16]xi8>, memref<?x?xi8>
-    ```
-
-    Example 2: Store a FP 32-bit element ZA tile with vertical layout to memory.
-    ```mlir
-    arm_sme.tile_store %tile, %base[%c0, %c0], <vertical> : vector<[4]x[4]xf32>, memref<?x?xf32>
-    ```
-
-    Example 3: Store a 128-bit element ZA tile with horizontal (default) layout to memory.
-    ```mlir
-    arm_sme.tile_store %tile, %base[%c0, %c0], <horizontal> : vector<[1]x[1]xi128>, memref<?x?xi128>
-    ```
-  }];
-  let arguments = (ins SMETile:$valueToStore,
-    Arg<AnyMemRef, "the reference to store to", [MemWrite]>:$base,
-    Variadic<Index>:$indices,
-    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
-                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
-  );
-  let extraClassDeclaration = [{
-    MemRefType getMemRefType() {
-      return ::llvm::cast<MemRefType>(getBase().getType());
-    }
-    VectorType getVectorType() {
-      return ::llvm::cast<VectorType>(getValueToStore().getType());
-    }
-  }];
-
-  let assemblyFormat =
-    "$valueToStore `,` $base `[` $indices `]` (`,` $layout^)? attr-dict "
-      "`:` type($base) `,` type($valueToStore)";
-}
-
-def LoadTileSliceOp : ArmSME_Op<"load_tile_slice", [
-    AllTypesMatch<["tile", "result"]>
-]> {
-  let summary = "Tile slice load and update operation";
-  let description = [{
-    Loads a 1D tile slice from memory into a 2D SME "virtual tile". The tile
-    slice is defined by the dimension of the 2D scalable vector type pointed by
-    the index. A tile slice index describes where in the input tile the tile
-    slice is loaded to. An optional tile slice layout attribute specifies
-    whether the tile slice being loaded at the given index is horizontal
-    (default) or vertical. The updated tile is returned as the result.
-
-    The slice of memory read is defined by a base and indices and must be
-    contiguous. The memref must be either rank 1 or rank 2, have dynamic
-    dimensions since the operation is scalable, and the element type must be a
-    scalar that matches the element type of the result.
-
-    Example 1: Load a vector<[16]xi8> tile slice from memory into tile horizontally (default) at given index.
-    ```mlir
-    %tile_update = arm_sme.load_tile_slice %base[%c0], %tile, %tile_slice_index : memref<?x?xi8>, vector<[16]x[16]xi8>
-    ```
-
-    Example 2: Load a vector<[4]xf32> tile slice from memory into tile vertically at given index.
-    ```mlir
-    %tile_update = arm_sme.load_tile_slice %base[%c0], %tile, %tile_slice_index, <vertical> : memref<?x?xf32>, vector<[4]x[4]xf32>
-    ```
-
-    Example 3: Load a vector<[1]xi128> tile slice from memory into tile vertically at given index.
-    ```mlir
-    %tile_update = arm_sme.load_tile_slice %base[%c0], %tile, %tile_slice_index, <vertical> : memref<?x?xi128>, vector<[1]x[1]xi128>
-    ```
-  }];
-  let arguments = (ins
-    Arg<AnyMemRef, "the reference to load from">:$base,
-    SMETile:$tile, Variadic<Index>:$indices, Index:$tile_slice_index,
-    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
-                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
-  );
-  let results = (outs SMETile:$result);
-
-  let extraClassDeclaration = [{
-    MemRefType getMemRefType() {
-      return ::llvm::cast<MemRefType>(getBase().getType());
-    }
-    VectorType getVectorType() {
-      return ::llvm::cast<VectorType>(getResult().getType());
-    }
-  }];
-
-  let assemblyFormat = [{
-    $base `[` $indices `]` `,` $tile `,` $tile_slice_index (`,` $layout^)?
-      attr-dict `:` type($base) `,` type($result)
-  }];
-}
-
-def StoreTileSliceOp : ArmSME_Op<"store_tile_slice"> {
-  let summary = "Tile slice store operation";
-  let description = [{
-    Stores a 1D tile slice from a 2D SME "virtual tile" into memory. The tile
-    slice is defined by the dimension of the 2D scalable vector type pointed by
-    the index. A tile slice index describes where in the input tile the tile
-    slice is stored from. An optional tile slice layout attribute specifies
-    whether the tile slice being stored from the given index is horizontal
-    (default) or vertical.
-
-    The slice of memory written is defined by a base and indices and must be
-    contiguous. The memref must be either rank 1 or rank 2, have dynamic
-    dimensions since the operation is scalable, and the element type must be a
-    scalar that matches the element type of the input tile.
-
-    Example 1: Store vector<[16]xi8> horizontal (default) tile slice from tile at given index to memory.
-    ```mlir
-    arm_sme.store_tile_slice %tile, %tile_slice_index, %base[%c0] : vector<[16]x[16]xi8>, memref<?x?xi8>
-    ```
-
-    Example 2: Store vector<[4]xf32> vertical tile slice from tile at given index to memory.
-    ```mlir
-    arm_sme.store_tile_slice %tile, %tile_slice_index, %base[%c0], <vertical> : vector<[4]x[4]xf32>, memref<?x?xf32>
-    ```
-
-    Example 3: Store a vector<[1]xi128> vertical tile slice from tile at given index to memory.
-    ```mlir
-    arm_sme.store_tile_slice %tile, %tile_slice_index, %base[%c0], <vertical> : vector<[1]x[1]xi128>, memref<?x?xi128>
-    ```
-  }];
-  let arguments = (ins SMETile:$tile, Index:$tile_slice_index,
-    Arg<AnyMemRef, "the reference to store to", [MemWrite]>:$base,
-    Variadic<Index>:$indices,
-    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
-                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
-  );
-  let extraClassDeclaration = [{
-    MemRefType getMemRefType() {
-      return ::llvm::cast<MemRefType>(getBase().getType());
-    }
-    VectorType getVectorType() {
-      return ::llvm::cast<VectorType>(getTile().getType());
-    }
-  }];
-
-  let assemblyFormat = [{
-    $tile `,` $tile_slice_index `,` $base `[` $indices `]` (`,` $layout^)?
-      attr-dict `:` type($base) `,` type($tile)
-  }];
-}
-
-def MoveVectorToTileSliceOp : ArmSME_Op<"move_vector_to_tile_slice", [
-    AllTypesMatch<["tile", "result"]>,
-    TypesMatchWith<
-      "type of 'vector' matches type of 'tile' slice",
-      "tile", "vector",
-      "VectorType::get("
-        "::llvm::cast<mlir::VectorType>($_self).getShape().drop_front(),"
-        "::llvm::cast<mlir::VectorType>($_self).getElementType(),"
-        "/*scalableDims=*/{true})">,
-]> {
-  let summary = "Move 1-D scalable vector to slice of 2-D tile";
-  let description = [{
-    The vector to tile slice operation moves a 1-D scalable vector to a slice
-    of a 2-D scalable vector tile at the given index. The type of the 1-D
-    scalable vector to be moved must match the type of the tile slice. A tile
-    slice is a 1-D vector of horizontally or vertically contiguous elements
-    within a ZA tile. Horizontal tile slices are currently assumed when
-    lowering to intrinsics. The updated tile is returned as the result.
-
-    Example 1: Move a vector<[16]xi8> into tile at given index.
-    ```mlir
-    %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[16]xi8> into vector<[16]x[16]xi8>
-    ```
-
-    Example 2: Move a vector<[2]xf64> into tile at given index.
-    ```mlir
-    %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[2]xf64> into vector<[2]x[2]xf64>
-    ```
-  }];
-  let arguments = (ins
-      SVEVector:$vector, SMETile:$tile, Index:$tile_slice_index);
-  let results = (outs SMETile:$result);
-
-  let extraClassDeclaration = [{
-    VectorType getTileType() {
-      return ::llvm::cast<VectorType>(getTile().getType());
-    }
-  }];
-
-  let assemblyFormat = [{
-    $vector `,` $tile `,` $tile_slice_index
-      attr-dict `:` type($vector) `into` type($result)
-  }];
-}
-
-def MoveTileSliceToVectorOp : ArmSME_Op<"move_tile_slice_to_vector", [Pure,
-    TypesMatchWith<
-      "type of 'result' matches type of 'tile' slice",
-      "tile", "result",
-      "VectorType(VectorType::Builder(::llvm::cast<mlir::VectorType>($_self)).dropDim(0))">,
-]> {
-  let summary = "Move slice of a 2-D tile to a 1-D scalable vector";
-  let description = [{
-    The tile slice to vector operation extracts a 1-D scalable slice from a 2-D
-    scalable tile at the given index. A tile slice is a 1-D vector of
-    horizontally or vertically contiguous elements within a ZA tile. Horizontal
-    tile slices are currently assumed when lowering to intrinsics.
-
-    Example 1: Extract `vector<[16]xi8>` from tile at the given index.
-    ```mlir
-    %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8>
-    ```
-
-    Example 2: Extract `vector<[2]xf64>` from tile at the given index.
-    ```mlir
-    %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xf64> from vector<[2]x[2]xf64>
-    ```
-  }];
-
-  let arguments = (ins SMETile:$tile, Index:$tile_slice_index);
-  let results = (outs SVEVector:$result);
-
-  let extraClassDeclaration = [{
-    VectorType getSliceType() { return getResult().getType(); }
-  }];
-
-  let assemblyFormat = [{
-      $tile `[` $tile_slice_index `]` attr-dict
-      `:` type($result) `from` type($tile)
-  }];
-}
-
-//===----------------------------------------------------------------------===//
-// ArmSME Intrinsic op definitions
-//===----------------------------------------------------------------------===//
-
-def MOPPredicate : ScalableVectorOfLengthAndType<[16, 8, 4, 2], [I1]>;
-def MOPVector : ScalableVectorOfLengthAndType<[16, 8, 4, 2],
-                                              [I8, I16, BF16, F16, F32, F64]>;
-def LDSTPredicate : ScalableVectorOfLengthAndType<[16, 8, 4, 2, 1], [I1]>;
-
-class ArmSME_IntrOp<string mnemonic, list<int> overloadedOperands = [],
-                    list<Trait> traits = [], int numResults = 0,
-                    list<int> overloadedResults = []>
-    : LLVM_IntrOpBase<
-          /*Dialect dialect=*/ArmSME_Dialect,
-          /*string opName=*/"intr." # mnemonic,
-          /*string enumName=*/"aarch64_sme_" # !subst(".", "_", mnemonic),
-          /*list<int> overloadedResults=*/overloadedResults,
-          /*list<int> overloadedOperands=*/overloadedOperands,
-          /*list<Trait> traits=*/traits,
-          /*int numResults=*/numResults>;
-
-// Zero
-def LLVM_aarch64_sme_zero : ArmSME_IntrOp<"zero">,
-                            Arguments<(ins Arg<I32, "Tile mask">)>;
-
-// MOP's
-class ArmSME_IntrMopOverloadedOp<string mnemonic>
-    : ArmSME_IntrOp<mnemonic, [4]>,
-      Arguments<(ins Arg<I32, "Virtual tile ID">,
-                 Arg<MOPPredicate, "LHS predicate">,
-                 Arg<MOPPredicate, "RHS predicate">,
-                 Arg<MOPVector, "LHS vector operand">,
-                 Arg<MOPVector, "RHS vector operand">)>;
-
-def LLVM_aarch64_sme_mopa : ArmSME_IntrMopOverloadedOp<"mopa">;
-def LLVM_aarch64_sme_mops : ArmSME_IntrMopOverloadedOp<"mops">;
-def LLVM_aarch64_sme_mopa_wide : ArmSME_IntrMopOverloadedOp<"mopa.wide">;
-def LLVM_aarch64_sme_mops_wide : ArmSME_IntrMopOverloadedOp<"mops.wide">;
-def LLVM_aarch64_sme_smopa_wide : ArmSME_IntrMopOverloadedOp<"smopa.wide">;
-def LLVM_aarch64_sme_smops_wide : ArmSME_IntrMopOverloadedOp<"smops.wide">;
-def LLVM_aarch64_sme_umopa_wide : ArmSME_IntrMopOverloadedOp<"umopa.wide">;
-def LLVM_aarch64_sme_umops_wide : ArmSME_IntrMopOverloadedOp<"umops.wide">;
-def LLVM_aarch64_sme_sumopa_wide : ArmSME_IntrMopOverloadedOp<"sumopa.wide">;
-def LLVM_aarch64_sme_sumops_wide : ArmSME_IntrMopOverloadedOp<"sumops.wide">;
-def LLVM_aarch64_sme_usmopa_wide : ArmSME_IntrMopOverloadedOp<"usmopa.wide">;
-def LLVM_aarch64_sme_usmops_wide : ArmSME_IntrMopOverloadedOp<"usmops.wide">;
-
-// Loads
-class ArmSME_IntrLoadOp<string mnemonic>
-    : ArmSME_IntrOp<mnemonic>,
-      Arguments<(ins Arg<LDSTPredicate, "Vector predicate">,
-                 Arg<LLVM_AnyPointer, "Load address">,
-                 Arg<I32, "Virtual tile ID">,
-                 Arg<I32, "Tile slice">)>;
-
-def LLVM_aarch64_sme_ld1b_horiz : ArmSME_IntrLoadOp<"ld1b.horiz">;
-def LLVM_aarch64_sme_ld1h_horiz : ArmSME_IntrLoadOp<"ld1h.horiz">;
-def LLVM_aarch64_sme_ld1w_horiz : ArmSME_IntrLoadOp<"ld1w.horiz">;
-def LLVM_aarch64_sme_ld1d_horiz : ArmSME_IntrLoadOp<"ld1d.horiz">;
-def LLVM_aarch64_sme_ld1q_horiz : ArmSME_IntrLoadOp<"ld1q.horiz">;
-def LLVM_aarch64_sme_ld1b_vert : ArmSME_IntrLoadOp<"ld1b.vert">;
-def LLVM_aarch64_sme_ld1h_vert : ArmSME_IntrLoadOp<"ld1h.vert">;
-def LLVM_aarch64_sme_ld1w_vert : ArmSME_IntrLoadOp<"ld1w.vert">;
-def LLVM_aarch64_sme_ld1d_vert : ArmSME_IntrLoadOp<"ld1d.vert">;
-def LLVM_aarch64_sme_ld1q_vert : ArmSME_IntrLoadOp<"ld1q.vert">;
-
-// Stores
-class ArmSME_IntrStoreOp<string mnemonic>
-    : ArmSME_IntrOp<mnemonic>,
-      Arguments<(ins Arg<LDSTPredicate, "Vector predicate">,
-                 Arg<LLVM_AnyPointer, "Store address", [MemWrite]>,
-                 Arg<I32, "Virtual tile ID">,
-                 Arg<I32, "Tile slice">)>;
-
-def LLVM_aarch64_sme_st1b_horiz : ArmSME_IntrStoreOp<"st1b.horiz">;
-def LLVM_aarch64_sme_st1h_horiz : ArmSME_IntrStoreOp<"st1h.horiz">;
-def LLVM_aarch64_sme_st1w_horiz : ArmSME_IntrStoreOp<"st1w.horiz">;
-def LLVM_aarch64_sme_st1d_horiz : ArmSME_IntrStoreOp<"st1d.horiz">;
-def LLVM_aarch64_sme_st1q_horiz : ArmSME_IntrStoreOp<"st1q.horiz">;
-def LLVM_aarch64_sme_st1b_vert : ArmSME_IntrStoreOp<"st1b.vert">;
-def LLVM_aarch64_sme_st1h_vert : ArmSME_IntrStoreOp<"st1h.vert">;
-def LLVM_aarch64_sme_st1w_vert : ArmSME_IntrStoreOp<"st1w.vert">;
-def LLVM_aarch64_sme_st1d_vert : ArmSME_IntrStoreOp<"st1d.vert">;
-def LLVM_aarch64_sme_st1q_vert : ArmSME_IntrStoreOp<"st1q.vert">;
-
-def LLVM_aarch64_sme_str
-    : ArmSME_IntrOp<"str">,
-      Arguments<(ins Arg<I32, "Index">,
-                 Arg<LLVM_AnyPointer, "Store address", [MemWrite]>)>;
-
-// Vector to tile slice
-class LLVM_aarch64_sme_write<string direction>
-    : ArmSME_IntrOp<"write." # direction, /*overloadedOperands=*/[3],
-                    [AllShapesMatch<["pg", "vector"]>]>,
-      Arguments<(ins Arg<I32, "Virtual tile ID">,
-                     Arg<I32, "Tile slice">,
-                     Arg<SVEPredicate, "Vector predicate">:$pg,
-                     Arg<SVEVector, "Vector operand">:$vector)>;
-
-// Tile slice to vector
-class LLVM_aarch64_sme_read<string direction>
-    : ArmSME_IntrOp<"read." # direction, /*overloadedOperands=*/[],
-                    [AllShapesMatch<["vector", "pg", "res"]>,
-                     AllElementTypesMatch<["vector", "res"]>],
-                    /*numResults=*/1, /*overloadedResults=*/[0]>,
-      Arguments<(ins Arg<SVEVector, "Vector operand">:$vector,
-                     Arg<SVEPredicate, "Vector predicate">:$pg,
-                     Arg<I32, "Virtual tile ID">,
-                     Arg<I32, "Tile slice">)>;
-
-def LLVM_aarch64_sme_write_horiz : LLVM_aarch64_sme_write<"horiz">;
-def LLVM_aarch64_sme_write_vert : LLVM_aarch64_sme_write<"vert">;
-
-def LLVM_aarch64_sme_read_horiz : LLVM_aarch64_sme_read<"horiz">;
-def LLVM_aarch64_sme_read_vert : LLVM_aarch64_sme_read<"vert">;
-
-def LLVM_aarch64_sme_za_enable : ArmSME_IntrOp<"za.enable">;
-def LLVM_aarch64_sme_za_disable : ArmSME_IntrOp<"za.disable">;
 
-#endif // ARMSME_OPS
+#endif // ARMSME
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
new file mode 100644
index 00000000000000..feeac3b8a0355f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -0,0 +1,137 @@
+//===-- ArmSMEIntrinsicOps.td ------------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions of the intrinsic Ops for the ArmSME dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSME_INTRINSIC_OPS
+#define ARMSME_INTRINSIC_OPS
+
+include "ArmSME.td"
+
+//===----------------------------------------------------------------------===//
+// ArmSME Intrinsic op definitions
+//===----------------------------------------------------------------------===//
+
+def MOPPredicate : ScalableVectorOfLengthAndType<[16, 8, 4, 2], [I1]>;
+def MOPVector : ScalableVectorOfLengthAndType<[16, 8, 4, 2],
+                                              [I8, I16, BF16, F16, F32, F64]>;
+def LDSTPredicate : ScalableVectorOfLengthAndType<[16, 8, 4, 2, 1], [I1]>;
+
+class ArmSME_IntrOp<string mnemonic, list<int> overloadedOperands = [],
+                    list<Trait> traits = [], int numResults = 0,
+                    list<int> overloadedResults = []>
+    : LLVM_IntrOpBase<
+          /*Dialect dialect=*/ArmSME_Dialect,
+          /*string opName=*/"intr." # mnemonic,
+          /*string enumName=*/"aarch64_sme_" # !subst(".", "_", mnemonic),
+          /*list<int> overloadedResults=*/overloadedResults,
+          /*list<int> overloadedOperands=*/overloadedOperands,
+          /*list<Trait> traits=*/traits,
+          /*int numResults=*/numResults>;
+
+// Zero
+def LLVM_aarch64_sme_zero : ArmSME_IntrOp<"zero">,
+                            Arguments<(ins Arg<I32, "Tile mask">)>;
+
+// MOP's
+class ArmSME_IntrMopOverloadedOp<string mnemonic>
+    : ArmSME_IntrOp<mnemonic, [4]>,
+      Arguments<(ins Arg<I32, "Virtual tile ID">,
+                 Arg<MOPPredicate, "LHS predicate">,
+                 Arg<MOPPredicate, "RHS predicate">,
+                 Arg<MOPVector, "LHS vector operand">,
+                 Arg<MOPVector, "RHS vector operand">)>;
+
+def LLVM_aarch64_sme_mopa : ArmSME_IntrMopOverloadedOp<"mopa">;
+def LLVM_aarch64_sme_mops : ArmSME_IntrMopOverloadedOp<"mops">;
+def LLVM_aarch64_sme_mopa_wide : ArmSME_IntrMopOverloadedOp<"mopa.wide">;
+def LLVM_aarch64_sme_mops_wide : ArmSME_IntrMopOverloadedOp<"mops.wide">;
+def LLVM_aarch64_sme_smopa_wide : ArmSME_IntrMopOverloadedOp<"smopa.wide">;
+def LLVM_aarch64_sme_smops_wide : ArmSME_IntrMopOverloadedOp<"smops.wide">;
+def LLVM_aarch64_sme_umopa_wide : ArmSME_IntrMopOverloadedOp<"umopa.wide">;
+def LLVM_aarch64_sme_umops_wide : ArmSME_IntrMopOverloadedOp<"umops.wide">;
+def LLVM_aarch64_sme_sumopa_wide : ArmSME_IntrMopOverloadedOp<"sumopa.wide">;
+def LLVM_aarch64_sme_sumops_wide : ArmSME_IntrMopOverloadedOp<"sumops.wide">;
+def LLVM_aarch64_sme_usmopa_wide : ArmSME_IntrMopOverloadedOp<"usmopa.wide">;
+def LLVM_aarch64_sme_usmops_wide : ArmSME_IntrMopOverloadedOp<"usmops.wide">;
+
+// Loads
+class ArmSME_IntrLoadOp<string mnemonic>
+    : ArmSME_IntrOp<mnemonic>,
+      Arguments<(ins Arg<LDSTPredicate, "Vector predicate">,
+                 Arg<LLVM_AnyPointer, "Load address">,
+                 Arg<I32, "Virtual tile ID">,
+                 Arg<I32, "Tile slice">)>;
+
+def LLVM_aarch64_sme_ld1b_horiz : ArmSME_IntrLoadOp<"ld1b.horiz">;
+def LLVM_aarch64_sme_ld1h_horiz : ArmSME_IntrLoadOp<"ld1h.horiz">;
+def LLVM_aarch64_sme_ld1w_horiz : ArmSME_IntrLoadOp<"ld1w.horiz">;
+def LLVM_aarch64_sme_ld1d_horiz : ArmSME_IntrLoadOp<"ld1d.horiz">;
+def LLVM_aarch64_sme_ld1q_horiz : ArmSME_IntrLoadOp<"ld1q.horiz">;
+def LLVM_aarch64_sme_ld1b_vert : ArmSME_IntrLoadOp<"ld1b.vert">;
+def LLVM_aarch64_sme_ld1h_vert : ArmSME_IntrLoadOp<"ld1h.vert">;
+def LLVM_aarch64_sme_ld1w_vert : ArmSME_IntrLoadOp<"ld1w.vert">;
+def LLVM_aarch64_sme_ld1d_vert : ArmSME_IntrLoadOp<"ld1d.vert">;
+def LLVM_aarch64_sme_ld1q_vert : ArmSME_IntrLoadOp<"ld1q.vert">;
+
+// Stores
+class ArmSME_IntrStoreOp<string mnemonic>
+    : ArmSME_IntrOp<mnemonic>,
+      Arguments<(ins Arg<LDSTPredicate, "Vector predicate">,
+                 Arg<LLVM_AnyPointer, "Store address", [MemWrite]>,
+                 Arg<I32, "Virtual tile ID">,
+                 Arg<I32, "Tile slice">)>;
+
+def LLVM_aarch64_sme_st1b_horiz : ArmSME_IntrStoreOp<"st1b.horiz">;
+def LLVM_aarch64_sme_st1h_horiz : ArmSME_IntrStoreOp<"st1h.horiz">;
+def LLVM_aarch64_sme_st1w_horiz : ArmSME_IntrStoreOp<"st1w.horiz">;
+def LLVM_aarch64_sme_st1d_horiz : ArmSME_IntrStoreOp<"st1d.horiz">;
+def LLVM_aarch64_sme_st1q_horiz : ArmSME_IntrStoreOp<"st1q.horiz">;
+def LLVM_aarch64_sme_st1b_vert : ArmSME_IntrStoreOp<"st1b.vert">;
+def LLVM_aarch64_sme_st1h_vert : ArmSME_IntrStoreOp<"st1h.vert">;
+def LLVM_aarch64_sme_st1w_vert : ArmSME_IntrStoreOp<"st1w.vert">;
+def LLVM_aarch64_sme_st1d_vert : ArmSME_IntrStoreOp<"st1d.vert">;
+def LLVM_aarch64_sme_st1q_vert : ArmSME_IntrStoreOp<"st1q.vert">;
+
+def LLVM_aarch64_sme_str
+    : ArmSME_IntrOp<"str">,
+      Arguments<(ins Arg<I32, "Index">,
+                 Arg<LLVM_AnyPointer, "Store address", [MemWrite]>)>;
+
+// Vector to tile slice
+class LLVM_aarch64_sme_write<string direction>
+    : ArmSME_IntrOp<"write." # direction, /*overloadedOperands=*/[3],
+                    [AllShapesMatch<["pg", "vector"]>]>,
+      Arguments<(ins Arg<I32, "Virtual tile ID">,
+                     Arg<I32, "Tile slice">,
+                     Arg<SVEPredicate, "Vector predicate">:$pg,
+                     Arg<SVEVector, "Vector operand">:$vector)>;
+
+// Tile slice to vector
+class LLVM_aarch64_sme_read<string direction>
+    : ArmSME_IntrOp<"read." # direction, /*overloadedOperands=*/[],
+                    [AllShapesMatch<["vector", "pg", "res"]>,
+                     AllElementTypesMatch<["vector", "res"]>],
+                    /*numResults=*/1, /*overloadedResults=*/[0]>,
+      Arguments<(ins Arg<SVEVector, "Vector operand">:$vector,
+                     Arg<SVEPredicate, "Vector predicate">:$pg,
+                     Arg<I32, "Virtual tile ID">,
+                     Arg<I32, "Tile slice">)>;
+
+def LLVM_aarch64_sme_write_horiz : LLVM_aarch64_sme_write<"horiz">;
+def LLVM_aarch64_sme_write_vert : LLVM_aarch64_sme_write<"vert">;
+
+def LLVM_aarch64_sme_read_horiz : LLVM_aarch64_sme_read<"horiz">;
+def LLVM_aarch64_sme_read_vert : LLVM_aarch64_sme_read<"vert">;
+
+def LLVM_aarch64_sme_za_enable : ArmSME_IntrOp<"za.enable">;
+def LLVM_aarch64_sme_za_disable : ArmSME_IntrOp<"za.disable">;
+
+#endif // ARMSME_INTRINSIC_OPS
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td
new file mode 100644
index 00000000000000..e09092268082dd
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td
@@ -0,0 +1,515 @@
+//===-- ArmSMEOps.td - ArmSME dialect operation definitions *- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ArmSME dialect ops. It also defines custom attributes
+// and types that are used to define the Ops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSME_OPS
+#define ARMSME_OPS
+
+include "ArmSME.td"
+include "mlir/IR/EnumAttr.td"
+include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+
+//===----------------------------------------------------------------------===//
+// ArmSME type definitions
+//===----------------------------------------------------------------------===//
+
+class SMETileType<Type datatype, list<int> dims, string description>
+  : ShapedContainerType<[datatype],
+      And<[IsVectorOfRankPred<[2]>, allDimsScalableVectorTypePred,
+           IsVectorOfShape<dims>]>,
+  description>;
+
+def nxnxv16i8  : SMETileType<I8,   [16, 16], "vector<[16]x[16]xi8>">;
+def nxnxv8i16  : SMETileType<I16,  [8,  8 ], "vector<[8]x[8]xi16>">;
+def nxnxv4i32  : SMETileType<I32,  [4,  4 ], "vector<[4]x[4]xi32>">;
+def nxnxv2i64  : SMETileType<I64,  [2,  2 ], "vector<[2]x[2]xi64>">;
+def nxnxv1i128 : SMETileType<I128, [1,  1 ], "vector<[1]x[1]xi128>">;
+
+def nxnxv8f16  : SMETileType<F16,  [8,  8 ], "vector<[8]x[8]xf16>">;
+def nxnxv8bf16 : SMETileType<BF16, [8,  8 ], "vector<[8]x[8]xbf16>">;
+def nxnxv4f32  : SMETileType<F32,  [4,  4 ], "vector<[4]x[4]xf32>">;
+def nxnxv2f64  : SMETileType<F64,  [2,  2 ], "vector<[2]x[2]xf64>">;
+
+def SMETile : AnyTypeOf<[nxnxv16i8, nxnxv8i16, nxnxv4i32, nxnxv2i64, nxnxv1i128,
+                         nxnxv8f16, nxnxv8bf16, nxnxv4f32, nxnxv2f64]>;
+
+// A type constraint that verifies the bitwidth of the scalar integer returned
+// from 'arm_sme.get_tile_id' matches the element bitwidth of a "virtual tile".
+def TileElementWidthMatchesTileID : TypesMatchWith<
+  "`tile_id` has the same number of bits as elements in `vector`",
+  "vector", "tile_id",
+  "IntegerType::get("
+      "$_self.getContext(),"
+      "::llvm::isa<IntegerType>(::llvm::cast<VectorType>($_self).getElementType())"
+          "? ::llvm::cast<IntegerType>("
+                  "::llvm::cast<VectorType>($_self).getElementType())"
+                  ".getWidth()"
+          ": ::llvm::cast<FloatType>("
+                  "::llvm::cast<VectorType>($_self).getElementType())"
+                  ".getWidth())">;
+
+//===----------------------------------------------------------------------===//
+// ArmSME attr definitions
+//===----------------------------------------------------------------------===//
+
+def TileSliceLayout : I32EnumAttr<"TileSliceLayout", "Layout of a tile slice", [
+  I32EnumAttrCase<"Horizontal", 0, "horizontal">,
+  I32EnumAttrCase<"Vertical", 1, "vertical">,
+]> {
+  let cppNamespace = "::mlir::arm_sme";
+  let genSpecializedAttr = 0;
+}
+
+/// An attribute that specifies the layout of a tile slice in a tile.
+def ArmSME_TileSliceLayoutAttr : EnumAttr<ArmSME_Dialect, TileSliceLayout,
+                                          "layout"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// ArmSME op definitions
+//===----------------------------------------------------------------------===//
+
+class ArmSME_Op<string mnemonic, list<Trait> traits = []> :
+  Op<ArmSME_Dialect, mnemonic, traits> {}
+
+def CastTileToVector : ArmSME_Op<"cast_tile_to_vector", [Pure, TileElementWidthMatchesTileID]> {
+  let summary = "Cast from tile id to 2-d scalable vector type";
+  let description = [{
+    A `cast_tile_to_vector` operation does a cast from a tile id to a 2-d
+    scalable vector type, which represents an SME "virtual tile". This would
+    normally be used when lowering operations that return "virtual tile" vector
+    types to model the output. This is required to preserve dataflow as SME
+    intrinsics have no return values.
+
+    Example:
+
+    Input:
+    ```mlir
+    %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+    vector.store %tile, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+    ```
+
+    After lowering `vector.load`:
+    ```mlir
+    %tile_id = arm_sme.get_tile_id : i32
+    scf.for %vnum = %c0 to %num_vectors step %c1 {
+      // ...
+      "arm_sme.intr.ld1w.horiz"(%pg, %ptr, %tile_id, %vnum) : (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
+    }
+    %tile = arm_sme.cast_tile_to_vector %tile_id : i32 to vector<[4]x[4]xi32>
+    vector.store %tile, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+    ```
+
+    In the example above, the `vector.load` can't be replaced with an SME
+    intrinsic that has no outputs since it is used by the `vector.store`.
+    However, by inserting a `cast_tile_to_vector` op after the load intrinsics
+    the `vector.load` can be replaced. This enables "local" rewrites on
+    individual vector ops, rather than "global" rewrites that would have to
+    look at the vector op uses and also lower them.
+
+    Canonicalization will look through `arm_sme.cast_tile_to_vector` and fold
+    the cast away if it comes from a `arm_sme.cast_vector_to_tile`.
+  }];
+  let arguments = (ins AnyTypeOf<[I8, I16, I32, I64, I128]>:$tile_id);
+  let results = (outs SMETile:$vector);
+  let assemblyFormat =
+    "$tile_id attr-dict `:` type($tile_id) `to` type($vector)";
+  let hasCanonicalizeMethod = 1;
+}
+
+def CastVectorToTile : ArmSME_Op<"cast_vector_to_tile", [Pure, TileElementWidthMatchesTileID]> {
+  let summary = "Cast from 2-d scalable vector type to tile id";
+  let description = [{
+    A `cast_vector_to_tile` operation does a cast from a 2-d scalable vector
+    type, which represents an SME "virtual tile", to a tile id. This is
+    required to preserve dataflow as the SME intrinsics have no return values.
+
+    Example:
+
+    Input:
+    ```mlir
+    %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+    vector.store %tile, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+    ```
+
+    After lowering `vector.store`:
+    ```mlir
+    %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+    scf.for %vnum = %c0 to %num_vectors step %c1 {
+      // ...
+      %tile_id = arm_sme.cast_vector_to_tile %tile : (vector<[4]x[4]xi32>) -> i32
+      "arm_sme.intr.st1w.horiz"(%pg, %ptr, %tile_id, %vnum) : (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
+    }
+    ```
+
+    Canonicalization will look through `arm_sme.cast_vector_to_tile` and fold
+    the cast away if it comes from a `arm_sme.cast_tile_to_vector`.
+  }];
+  let arguments = (ins SMETile:$vector);
+  let results = (outs AnyTypeOf<[I8, I16, I32, I64, I128]>:$tile_id);
+  let assemblyFormat =
+    "$vector attr-dict `:` type($vector) `to` type($tile_id)";
+  let hasCanonicalizeMethod = 1;
+}
+
+def GetTileID : ArmSME_Op<"get_tile_id"> {
+  let summary = "Returns an SME \"virtual tile\" id";
+  let description = [{
+    A `get_tile_id` operation returns a scalar integer representing an SME
+    "virtual tile" id. The bitwidth of the scalar indicates the element
+    bitwidth of the "virtual tile".
+
+    The scope of a tile id is a function and cannot be passed or returned from
+    functions.
+
+    Example:
+    ```mlir
+    // Allocate and return an 8-bit element "virtual tile" id
+    %za0_b = arm_sme.get_tile_id : i8
+    ```
+
+    Example:
+    ```
+    // Allocate and return two 16-bit element "virtual tile" ids
+    %za0_h = arm_sme.get_tile_id : i16
+    %za1_h = arm_sme.get_tile_id : i16
+    ```
+
+    Example:
+    ```
+    // Allocate and return an 128-bit element "virtual tile" id
+    %za0_q = arm_sme.get_tile_id : i128
+    ```
+  }];
+
+  let results = (outs AnyTypeOf<[I8, I16, I32, I64, I128]>:$tile_id);
+  let assemblyFormat = "attr-dict `:` type($tile_id)";
+}
+
+//
+// Tile reset.
+//
+
+def ZeroOp : ArmSME_Op<"zero", [Pure]> {
+  let summary = "Initialize the two-dimensional ZA array with 0s";
+  let results = (outs SMETile:$res);
+  let description = [{
+    Initialise ZA with 0. This operation is convenient wrapper for the SME
+    `zero` intrinsic and instruction. 
+
+    Example 1: Zero an 8-bit element ZA tile.
+
+    ```mlir
+    %0 = arm_sme.zero : vector<[16]x[16]xi8>
+    ```
+
+    Example 2: Zero a 64-bit element ZA tile.
+
+    ```mlir
+    %0 = arm_sme.zero : vector<[2]x[2]xi64>
+    ```
+  }];
+  let extraClassDeclaration = [{
+    VectorType getVectorType() {
+      return ::llvm::cast<VectorType>(getRes().getType());
+    }
+  }];
+  let assemblyFormat = "attr-dict `:` type($res)";
+}
+
+def TileLoadOp : ArmSME_Op<"tile_load"> {
+  let summary = "Tile load operation";
+  let description = [{
+    Loads a 2D SME "virtual tile" from memory defined by a base and indices,
+    with the shape defined by the 2D scalable vector type of the result tile.
+    An optional tile slice layout attribute specifies whether the slices of the
+    tile being loaded are horizontal (default) or vertical. The slice of memory
+    must be contiguous. The memref must be either rank 1 or rank 2 with dynamic
+    dimensions, since the operation is scalable, and the element type must be a
+    scalar that matches the element type of the result.
+
+    Example 1: Load an 8-bit element ZA tile with horizontal layout (default) from memory (ZA0.B).
+    ```mlir
+    %tile = arm_sme.tile_load %base[%c0, %c0] : memref<?x?xi8>, vector<[16]x[16]xi8>
+    ```
+
+    Example 2: Load a FP 32-bit element ZA tile with vertical layout from memory.
+    ```mlir
+    %tile = arm_sme.tile_load %base[%c0, %c0], <vertical> : memref<?x?xf32>, vector<[4]x[4]xf32>
+    ```
+
+    Example 3: Load a 128-bit element ZA tile with horizontal layout (default) from memory.
+    ```mlir
+    %tile = arm_sme.tile_load %base[%c0, %c0], <horizontal> : memref<?x?xi128>, vector<[1]x[1]xi128>
+    ```
+  }];
+  let arguments = (ins
+    Arg<AnyMemRef, "the reference to load from", [MemRead]>:$base,
+    Variadic<Index>:$indices,
+    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
+                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
+  );
+  let results = (outs SMETile:$result);
+
+  let extraClassDeclaration = [{
+    MemRefType getMemRefType() {
+      return ::llvm::cast<MemRefType>(getBase().getType());
+    }
+    VectorType getVectorType() {
+      return ::llvm::cast<VectorType>(getResult().getType());
+    }
+  }];
+
+  let assemblyFormat =
+    "$base `[` $indices `]` (`,` $layout^)? attr-dict "
+      "`:` type($base) `,` type($result)";
+}
+
+def TileStoreOp : ArmSME_Op<"tile_store"> {
+  let summary = "Tile store operation";
+  let description = [{
+    Stores a 2D SME "virtual tile" to memory defined by a base and indices,
+    with the shape defined by the 2D scalable vector type of the tile being
+    stored. An optional tile slice layout attribute specifies whether the
+    slices of the tile being stored are horizontal (default) or vertical. The
+    slice of memory must be contiguous. The memref must be either rank 1 or
+    rank 2 with dynamic dimensions, since the operation is scalable, and the
+    element type must be a scalar that matches the element type of the result.
+
+    Example 1: Store an 8-bit element ZA tile with horizontal (default) layout to memory (ZA0.B).
+    ```mlir
+    arm_sme.tile_store %tile, %base[%c0, %c0] : vector<[16]x[16]xi8>, memref<?x?xi8>
+    ```
+
+    Example 2: Store a FP 32-bit element ZA tile with vertical layout to memory.
+    ```mlir
+    arm_sme.tile_store %tile, %base[%c0, %c0], <vertical> : vector<[4]x[4]xf32>, memref<?x?xf32>
+    ```
+
+    Example 3: Store a 128-bit element ZA tile with horizontal (default) layout to memory.
+    ```mlir
+    arm_sme.tile_store %tile, %base[%c0, %c0], <horizontal> : vector<[1]x[1]xi128>, memref<?x?xi128>
+    ```
+  }];
+  let arguments = (ins SMETile:$valueToStore,
+    Arg<AnyMemRef, "the reference to store to", [MemWrite]>:$base,
+    Variadic<Index>:$indices,
+    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
+                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
+  );
+  let extraClassDeclaration = [{
+    MemRefType getMemRefType() {
+      return ::llvm::cast<MemRefType>(getBase().getType());
+    }
+    VectorType getVectorType() {
+      return ::llvm::cast<VectorType>(getValueToStore().getType());
+    }
+  }];
+
+  let assemblyFormat =
+    "$valueToStore `,` $base `[` $indices `]` (`,` $layout^)? attr-dict "
+      "`:` type($base) `,` type($valueToStore)";
+}
+
+def LoadTileSliceOp : ArmSME_Op<"load_tile_slice", [
+    AllTypesMatch<["tile", "result"]>
+]> {
+  let summary = "Tile slice load and update operation";
+  let description = [{
+    Loads a 1D tile slice from memory into a 2D SME "virtual tile". The tile
+    slice is defined by the dimension of the 2D scalable vector type pointed by
+    the index. A tile slice index describes where in the input tile the tile
+    slice is loaded to. An optional tile slice layout attribute specifies
+    whether the tile slice being loaded at the given index is horizontal
+    (default) or vertical. The updated tile is returned as the result.
+
+    The slice of memory read is defined by a base and indices and must be
+    contiguous. The memref must be either rank 1 or rank 2, have dynamic
+    dimensions since the operation is scalable, and the element type must be a
+    scalar that matches the element type of the result.
+
+    Example 1: Load a vector<[16]xi8> tile slice from memory into tile horizontally (default) at given index.
+    ```mlir
+    %tile_update = arm_sme.load_tile_slice %base[%c0], %tile, %tile_slice_index : memref<?x?xi8>, vector<[16]x[16]xi8>
+    ```
+
+    Example 2: Load a vector<[4]xf32> tile slice from memory into tile vertically at given index.
+    ```mlir
+    %tile_update = arm_sme.load_tile_slice %base[%c0], %tile, %tile_slice_index, <vertical> : memref<?x?xf32>, vector<[4]x[4]xf32>
+    ```
+
+    Example 3: Load a vector<[1]xi128> tile slice from memory into tile vertically at given index.
+    ```mlir
+    %tile_update = arm_sme.load_tile_slice %base[%c0], %tile, %tile_slice_index, <vertical> : memref<?x?xi128>, vector<[1]x[1]xi128>
+    ```
+  }];
+  let arguments = (ins
+    Arg<AnyMemRef, "the reference to load from">:$base,
+    SMETile:$tile, Variadic<Index>:$indices, Index:$tile_slice_index,
+    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
+                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
+  );
+  let results = (outs SMETile:$result);
+
+  let extraClassDeclaration = [{
+    MemRefType getMemRefType() {
+      return ::llvm::cast<MemRefType>(getBase().getType());
+    }
+    VectorType getVectorType() {
+      return ::llvm::cast<VectorType>(getResult().getType());
+    }
+  }];
+
+  let assemblyFormat = [{
+    $base `[` $indices `]` `,` $tile `,` $tile_slice_index (`,` $layout^)?
+      attr-dict `:` type($base) `,` type($result)
+  }];
+}
+
+def StoreTileSliceOp : ArmSME_Op<"store_tile_slice"> {
+  let summary = "Tile slice store operation";
+  let description = [{
+    Stores a 1D tile slice from a 2D SME "virtual tile" into memory. The tile
+    slice is defined by the dimension of the 2D scalable vector type pointed by
+    the index. A tile slice index describes where in the input tile the tile
+    slice is stored from. An optional tile slice layout attribute specifies
+    whether the tile slice being stored from the given index is horizontal
+    (default) or vertical.
+
+    The slice of memory written is defined by a base and indices and must be
+    contiguous. The memref must be either rank 1 or rank 2, have dynamic
+    dimensions since the operation is scalable, and the element type must be a
+    scalar that matches the element type of the input tile.
+
+    Example 1: Store vector<[16]xi8> horizontal (default) tile slice from tile at given index to memory.
+    ```mlir
+    arm_sme.store_tile_slice %tile, %tile_slice_index, %base[%c0] : vector<[16]x[16]xi8>, memref<?x?xi8>
+    ```
+
+    Example 2: Store vector<[4]xf32> vertical tile slice from tile at given index to memory.
+    ```mlir
+    arm_sme.store_tile_slice %tile, %tile_slice_index, %base[%c0], <vertical> : vector<[4]x[4]xf32>, memref<?x?xf32>
+    ```
+
+    Example 3: Store a vector<[1]xi128> vertical tile slice from tile at given index to memory.
+    ```mlir
+    arm_sme.store_tile_slice %tile, %tile_slice_index, %base[%c0], <vertical> : vector<[1]x[1]xi128>, memref<?x?xi128>
+    ```
+  }];
+  let arguments = (ins SMETile:$tile, Index:$tile_slice_index,
+    Arg<AnyMemRef, "the reference to store to", [MemWrite]>:$base,
+    Variadic<Index>:$indices,
+    DefaultValuedAttr<ArmSME_TileSliceLayoutAttr,
+                      "::mlir::arm_sme::TileSliceLayout::Horizontal">:$layout
+  );
+  let extraClassDeclaration = [{
+    MemRefType getMemRefType() {
+      return ::llvm::cast<MemRefType>(getBase().getType());
+    }
+    VectorType getVectorType() {
+      return ::llvm::cast<VectorType>(getTile().getType());
+    }
+  }];
+
+  let assemblyFormat = [{
+    $tile `,` $tile_slice_index `,` $base `[` $indices `]` (`,` $layout^)?
+      attr-dict `:` type($base) `,` type($tile)
+  }];
+}
+
+def MoveVectorToTileSliceOp : ArmSME_Op<"move_vector_to_tile_slice", [
+    AllTypesMatch<["tile", "result"]>,
+    TypesMatchWith<
+      "type of 'vector' matches type of 'tile' slice",
+      "tile", "vector",
+      "VectorType::get("
+        "::llvm::cast<mlir::VectorType>($_self).getShape().drop_front(),"
+        "::llvm::cast<mlir::VectorType>($_self).getElementType(),"
+        "/*scalableDims=*/{true})">,
+]> {
+  let summary = "Move 1-D scalable vector to slice of 2-D tile";
+  let description = [{
+    The vector to tile slice operation moves a 1-D scalable vector to a slice
+    of a 2-D scalable vector tile at the given index. The type of the 1-D
+    scalable vector to be moved must match the type of the tile slice. A tile
+    slice is a 1-D vector of horizontally or vertically contiguous elements
+    within a ZA tile. Horizontal tile slices are currently assumed when
+    lowering to intrinsics. The updated tile is returned as the result.
+
+    Example 1: Move a vector<[16]xi8> into tile at given index.
+    ```mlir
+    %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[16]xi8> into vector<[16]x[16]xi8>
+    ```
+
+    Example 2: Move a vector<[2]xf64> into tile at given index.
+    ```mlir
+    %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[2]xf64> into vector<[2]x[2]xf64>
+    ```
+  }];
+  let arguments = (ins
+      SVEVector:$vector, SMETile:$tile, Index:$tile_slice_index);
+  let results = (outs SMETile:$result);
+
+  let extraClassDeclaration = [{
+    VectorType getTileType() {
+      return ::llvm::cast<VectorType>(getTile().getType());
+    }
+  }];
+
+  let assemblyFormat = [{
+    $vector `,` $tile `,` $tile_slice_index
+      attr-dict `:` type($vector) `into` type($result)
+  }];
+}
+
+def MoveTileSliceToVectorOp : ArmSME_Op<"move_tile_slice_to_vector", [Pure,
+    TypesMatchWith<
+      "type of 'result' matches type of 'tile' slice",
+      "tile", "result",
+      "VectorType(VectorType::Builder(::llvm::cast<mlir::VectorType>($_self)).dropDim(0))">,
+]> {
+  let summary = "Move slice of a 2-D tile to a 1-D scalable vector";
+  let description = [{
+    The tile slice to vector operation extracts a 1-D scalable slice from a 2-D
+    scalable tile at the given index. A tile slice is a 1-D vector of
+    horizontally or vertically contiguous elements within a ZA tile. Horizontal
+    tile slices are currently assumed when lowering to intrinsics.
+
+    Example 1: Extract `vector<[16]xi8>` from tile at the given index.
+    ```mlir
+    %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8>
+    ```
+
+    Example 2: Extract `vector<[2]xf64>` from tile at the given index.
+    ```mlir
+    %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xf64> from vector<[2]x[2]xf64>
+    ```
+  }];
+
+  let arguments = (ins SMETile:$tile, Index:$tile_slice_index);
+  let results = (outs SVEVector:$result);
+
+  let extraClassDeclaration = [{
+    VectorType getSliceType() { return getResult().getType(); }
+  }];
+
+  let assemblyFormat = [{
+      $tile `[` $tile_slice_index `]` attr-dict
+      `:` type($result) `from` type($tile)
+  }];
+}
+
+#endif // ARMSME_OPS
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSME/IR/CMakeLists.txt
index 617809e482b2ca..9319a8042c93f4 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/CMakeLists.txt
@@ -1,12 +1,27 @@
 add_mlir_dialect(ArmSME arm_sme ArmSME)
-add_mlir_doc(ArmSME ArmSME Dialects/ -gen-dialect-doc -dialect=arm_sme)
-
-set(LLVM_TARGET_DEFINITIONS ArmSME.td)
-mlir_tablegen(ArmSMEConversions.inc -gen-llvmir-conversions)
-add_public_tablegen_target(MLIRArmSMEConversionsIncGen)
 
+# Generate declarations and definitions of ArmSME Ops
+set(LLVM_TARGET_DEFINITIONS ArmSMEOps.td)
+mlir_tablegen(ArmSMEOps.h.inc -gen-op-decls)
+mlir_tablegen(ArmSMEOps.cpp.inc -gen-op-defs)
 mlir_tablegen(ArmSMEEnums.h.inc -gen-enum-decls)
 mlir_tablegen(ArmSMEEnums.cpp.inc -gen-enum-defs)
 mlir_tablegen(ArmSMEAttrDefs.h.inc -gen-attrdef-decls -attrdefs-dialect=arm_sme)
 mlir_tablegen(ArmSMEAttrDefs.cpp.inc -gen-attrdef-defs -attrdefs-dialect=arm_sme)
-add_public_tablegen_target(MLIRArmSMEAttrDefsIncGen)
+add_public_tablegen_target(MLIRArmSMEOpsIncGen)
+
+# Generate LLVM IR Conversions
+set(LLVM_TARGET_DEFINITIONS ArmSMEOps.td)
+mlir_tablegen(ArmSMEOpsConversions.inc -gen-llvmir-conversions)
+add_public_tablegen_target(MLIRArmSMEConversionsIncGen)
+
+# Generate declarations and definitions of ArmSME intrinsic Ops
+set(LLVM_TARGET_DEFINITIONS ArmSMEIntrinsicOps.td)
+mlir_tablegen(ArmSMEIntrinsicOps.h.inc -gen-op-decls)
+mlir_tablegen(ArmSMEIntrinsicOps.cpp.inc -gen-op-defs)
+mlir_tablegen(ArmSMEIntrinsicConversions.inc -gen-llvmir-conversions)
+add_public_tablegen_target(MLIRArmSMEIntrinsicOpsIncGen)
+
+# Generate the docs
+add_mlir_doc(ArmSMEOps ArmSMEOps Dialects/ -gen-op-doc)
+add_mlir_doc(ArmSMEIntrinsicOps ArmSMEIntrinsicOps Dialects/ -gen-op-doc)
diff --git a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
index a0eb5ff00cb9fe..9b588eb610e51b 100644
--- a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
+++ b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
@@ -85,6 +85,7 @@ def OneShotBufferizeOp
       DefaultValuedAttr<BoolAttr, "false">:$allow_return_allocs_from_loops,
       DefaultValuedAttr<BoolAttr, "false">:$allow_unknown_ops,
       DefaultValuedAttr<BoolAttr, "false">:$bufferize_function_boundaries,
+      DefaultValuedAttr<BoolAttr, "false">:$dump_alias_sets,
       DefaultValuedAttr<BoolAttr, "false">:$test_analysis_only,
       DefaultValuedAttr<BoolAttr, "false">:$print_conflicts,
       DefaultValuedAttr<StrAttr, "\"memref.copy\"">:$memcpy_op);
diff --git a/mlir/include/mlir/Dialect/CommonFolders.h b/mlir/include/mlir/Dialect/CommonFolders.h
index 6257e4a60188d5..7dabc781cd5952 100644
--- a/mlir/include/mlir/Dialect/CommonFolders.h
+++ b/mlir/include/mlir/Dialect/CommonFolders.h
@@ -93,8 +93,12 @@ Attribute constFoldBinaryOpConditional(ArrayRef<Attribute> operands,
     if (lhs.getType() != rhs.getType())
       return {};
 
-    auto lhsIt = lhs.value_begin<ElementValueT>();
-    auto rhsIt = rhs.value_begin<ElementValueT>();
+    auto maybeLhsIt = lhs.try_value_begin<ElementValueT>();
+    auto maybeRhsIt = rhs.try_value_begin<ElementValueT>();
+    if (!maybeLhsIt || !maybeRhsIt)
+      return {};
+    auto lhsIt = *maybeLhsIt;
+    auto rhsIt = *maybeRhsIt;
     SmallVector<ElementValueT, 4> elementResults;
     elementResults.reserve(lhs.getNumElements());
     for (size_t i = 0, e = lhs.getNumElements(); i < e; ++i, ++lhsIt, ++rhsIt) {
@@ -227,7 +231,10 @@ Attribute constFoldUnaryOpConditional(ArrayRef<Attribute> operands,
     // expanding the values.
     auto op = cast<ElementsAttr>(operands[0]);
 
-    auto opIt = op.value_begin<ElementValueT>();
+    auto maybeOpIt = op.try_value_begin<ElementValueT>();
+    if (!maybeOpIt)
+      return {};
+    auto opIt = *maybeOpIt;
     SmallVector<ElementValueT> elementResults;
     elementResults.reserve(op.getNumElements());
     for (size_t i = 0, e = op.getNumElements(); i < e; ++i, ++opIt) {
@@ -293,12 +300,14 @@ Attribute constFoldCastOp(ArrayRef<Attribute> operands, Type resType,
       return {};
     return DenseElementsAttr::get(cast<ShapedType>(resType), elementResult);
   }
-  if (isa<ElementsAttr>(operands[0])) {
+  if (auto op = dyn_cast<ElementsAttr>(operands[0])) {
     // Operand is ElementsAttr-derived; perform an element-wise fold by
     // expanding the value.
-    auto op = cast<ElementsAttr>(operands[0]);
     bool castStatus = true;
-    auto opIt = op.value_begin<ElementValueT>();
+    auto maybeOpIt = op.try_value_begin<ElementValueT>();
+    if (!maybeOpIt)
+      return {};
+    auto opIt = *maybeOpIt;
     SmallVector<TargetElementValueT> elementResults;
     elementResults.reserve(op.getNumElements());
     for (size_t i = 0, e = op.getNumElements(); i < e; ++i, ++opIt) {
diff --git a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
index 04b90ce107755f..f4b43a59047054 100644
--- a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
+++ b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
@@ -246,8 +246,8 @@ def FuncOp : Func_Op<"func", [
 
     ```mlir
     // External function definitions.
-    func.func @abort()
-    func.func @scribble(i32, i64, memref<? x 128 x f32, #layout_map0>) -> f64
+    func.func private @abort()
+    func.func private @scribble(i32, i64, memref<? x 128 x f32, #layout_map0>) -> f64
 
     // A function that returns its argument twice:
     func.func @count(%x: i64) -> (i64, i64)
@@ -256,13 +256,13 @@ def FuncOp : Func_Op<"func", [
     }
 
     // A function with an argument attribute
-    func.func @example_fn_arg(%x: i32 {swift.self = unit})
+    func.func private @example_fn_arg(%x: i32 {swift.self = unit})
 
     // A function with a result attribute
-    func.func @example_fn_result() -> (f64 {dialectName.attrName = 0 : i64})
+    func.func private @example_fn_result() -> (f64 {dialectName.attrName = 0 : i64})
 
     // A function with an attribute
-    func.func @example_fn_attr() attributes {dialectName.attrName = false}
+    func.func private @example_fn_attr() attributes {dialectName.attrName = false}
     ```
   }];
 
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt
index be40d7bca66693..b7b12d49f9224d 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/MemRef/IR/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_mlir_dialect(MemRefOps memref)
-add_mlir_doc(MemRefOps MemRefOps Dialects/ -gen-dialect-doc -dialect=memref)
+add_mlir_doc(MemRefOps MemRefOps Dialects/ -gen-op-doc)
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 6b0ccbe37e89e9..8fa41f4e4b659f 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -646,6 +646,15 @@ def MemRef_DimOp : MemRef_Op<"dim", [
 def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
   let summary = "non-blocking DMA operation that starts a transfer";
   let description = [{
+    Syntax:
+    
+    ```
+    operation ::= `memref.dma_start` ssa-use`[`ssa-use-list`]` `,`
+                   ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
+                   ssa-use`[`ssa-use-list`]` (`,` ssa-use `,` ssa-use)?
+                  `:` memref-type `,` memref-type `,` memref-type
+    ```
+
     DmaStartOp starts a non-blocking DMA operation that transfers data from a
     source memref to a destination memref. The source and destination memref
     need not be of the same dimensionality, but need to have the same elemental
@@ -686,9 +695,9 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
               %num_elt_per_stride :
     ```
 
-    TODO: add additional operands to allow source and destination striding, and
+    * TODO: add additional operands to allow source and destination striding, and
     multiple stride levels.
-    TODO: Consider replacing src/dst memref indices with view memrefs.
+    * TODO: Consider replacing src/dst memref indices with view memrefs.
   }];
   let arguments = (ins Variadic<AnyType>:$operands);
 
@@ -839,8 +848,7 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> {
                        Variadic<Index>:$tagIndices,
                        Index:$numElements);
   let assemblyFormat = [{
-    $tagMemRef `[` $tagIndices `]` `,` $numElements attr-dict `:`
-    type($tagMemRef)
+    $tagMemRef `[` $tagIndices `]` `,` $numElements attr-dict `:` type($tagMemRef)
   }];
   let extraClassDeclaration = [{
     /// Returns the rank (number of indices) of the tag memref.
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index ec250b1b65899d..57cd1a3806c2ed 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -174,7 +174,7 @@ def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.des
   let assemblyFormat = "`<` struct(params) `>`";
 }
 
-def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "wgmma.descriptor", []> {
+def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
   let summary = "Warpgroup matrix descriptor type";
   let description = [{
   The descriptor specifies the properties of the matrix in shared memory that 
@@ -667,11 +667,12 @@ def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
   let hasVerifier = 1;
 }
 
-def NVGPU_GenerateGmmaDescriptorOp : NVGPU_Op<"wgmma.generate.descriptor", []> {
-  let summary = "Generate a wgmma matrix descriptor";
+def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
+  let summary = "Generate a warpgroup matrix descriptor";
   let description = [{
-  This Op builds a `nvgpu.wgmma.descriptor` that is used by warpgroup-level 
-  matrix multiply and accumulate.
+  This Op builds a `nvgpu.warpgroup.descriptor` that is used by 
+  `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and 
+  accumulate.
 
   The descriptor specifies the properties of the matrix in shared memory that 
   is a multiplicand in the matrix multiply and accumulate operation. 
@@ -702,9 +703,9 @@ def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
 
     Example:
     ```mlir
-      %r1,%r2 = nvgpu.warpgroup.mma %wgmmaDescA, %wgmmaDescB, %acc1, %acc2: 
-                 !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>, 
-                 !nvgpu.wgmma.descriptor<tensor = memref<64x128xf16, 3>>, 
+      %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2: 
+                 !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
+                 !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
                  !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
                  !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
                  -> 
@@ -727,4 +728,24 @@ def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
   let hasVerifier = 1;
 }
 
+def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
+  let description = [{
+    The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result 
+    in $matrixD to given memref. 
+
+    [See the details of register fragment layout for accumulator matrix D]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) 
+
+    Note that, the op must be run with warp group.
+  }];
+
+  let arguments = (ins Variadic<NVGPU_WarpgroupAccumulator>:$matrixD,
+                       Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
+  
+  let assemblyFormat = [{
+    `[` $matrixD `]` `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index ce1b4e29cd51b9..60156cc334c72e 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -1563,12 +1563,12 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> {
   let arguments = (ins SymbolNameAttr:$sym_name,
                        SymbolNameAttr:$func_name,
                        OptionalAttr<StrAttr>:$bind_name,
-                       OptionalAttr<UnitAttr>:$gang,
-                       OptionalAttr<UnitAttr>:$worker,
-                       OptionalAttr<UnitAttr>:$vector,
-                       OptionalAttr<UnitAttr>:$seq,
-                       OptionalAttr<UnitAttr>:$nohost,
-                       OptionalAttr<UnitAttr>:$implicit,
+                       UnitAttr:$gang,
+                       UnitAttr:$worker,
+                       UnitAttr:$vector,
+                       UnitAttr:$seq,
+                       UnitAttr:$nohost,
+                       UnitAttr:$implicit,
                        OptionalAttr<APIntAttr>:$gangDim);
 
   let extraClassDeclaration = [{
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 7a1aed509c2a36..2920ef79f461c6 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -170,33 +170,33 @@ enum class Action : uint32_t {
 // TODO: We should generalize TwoOutOfFour to N out of M and use property to
 // encode the value of N and M.
 // TODO: Update DimLevelType to use lower 8 bits for storage formats and the
-// higher 4 bits to store level properties. Consider CompressedWithHi and
+// higher 4 bits to store level properties. Consider LooseCompressed and
 // TwoOutOfFour as properties instead of formats.
 enum class DimLevelType : uint8_t {
-  Undef = 0,                 // 0b00000_00
-  Dense = 4,                 // 0b00001_00
-  Compressed = 8,            // 0b00010_00
-  CompressedNu = 9,          // 0b00010_01
-  CompressedNo = 10,         // 0b00010_10
-  CompressedNuNo = 11,       // 0b00010_11
-  Singleton = 16,            // 0b00100_00
-  SingletonNu = 17,          // 0b00100_01
-  SingletonNo = 18,          // 0b00100_10
-  SingletonNuNo = 19,        // 0b00100_11
-  CompressedWithHi = 32,     // 0b01000_00
-  CompressedWithHiNu = 33,   // 0b01000_01
-  CompressedWithHiNo = 34,   // 0b01000_10
-  CompressedWithHiNuNo = 35, // 0b01000_11
-  TwoOutOfFour = 64,         // 0b10000_00
+  Undef = 0,                // 0b00000_00
+  Dense = 4,                // 0b00001_00
+  Compressed = 8,           // 0b00010_00
+  CompressedNu = 9,         // 0b00010_01
+  CompressedNo = 10,        // 0b00010_10
+  CompressedNuNo = 11,      // 0b00010_11
+  Singleton = 16,           // 0b00100_00
+  SingletonNu = 17,         // 0b00100_01
+  SingletonNo = 18,         // 0b00100_10
+  SingletonNuNo = 19,       // 0b00100_11
+  LooseCompressed = 32,     // 0b01000_00
+  LooseCompressedNu = 33,   // 0b01000_01
+  LooseCompressedNo = 34,   // 0b01000_10
+  LooseCompressedNuNo = 35, // 0b01000_11
+  TwoOutOfFour = 64,        // 0b10000_00
 };
 
 /// This enum defines all supported storage format without the level properties.
 enum class LevelFormat : uint8_t {
-  Dense = 4,             // 0b00001_00
-  Compressed = 8,        // 0b00010_00
-  Singleton = 16,        // 0b00100_00
-  CompressedWithHi = 32, // 0b01000_00
-  TwoOutOfFour = 64,     // 0b10000_00
+  Dense = 4,            // 0b00001_00
+  Compressed = 8,       // 0b00010_00
+  Singleton = 16,       // 0b00100_00
+  LooseCompressed = 32, // 0b01000_00
+  TwoOutOfFour = 64,    // 0b10000_00
 };
 
 /// This enum defines all the nondefault properties for storage formats.
@@ -215,29 +215,29 @@ constexpr const char *toMLIRString(DimLevelType dlt) {
   case DimLevelType::Compressed:
     return "compressed";
   case DimLevelType::CompressedNu:
-    return "compressed_nu";
+    return "compressed(nonunique)";
   case DimLevelType::CompressedNo:
-    return "compressed_no";
+    return "compressed(nonordered)";
   case DimLevelType::CompressedNuNo:
-    return "compressed_nu_no";
+    return "compressed(nonunique, nonordered)";
   case DimLevelType::Singleton:
     return "singleton";
   case DimLevelType::SingletonNu:
-    return "singleton_nu";
+    return "singleton(nonunique)";
   case DimLevelType::SingletonNo:
-    return "singleton_no";
+    return "singleton(nonordered)";
   case DimLevelType::SingletonNuNo:
-    return "singleton_nu_no";
-  case DimLevelType::CompressedWithHi:
-    return "compressed_hi";
-  case DimLevelType::CompressedWithHiNu:
-    return "compressed_hi_nu";
-  case DimLevelType::CompressedWithHiNo:
-    return "compressed_hi_no";
-  case DimLevelType::CompressedWithHiNuNo:
-    return "compressed_hi_nu_no";
+    return "singleton(nonunique, nonordered)";
+  case DimLevelType::LooseCompressed:
+    return "loose_compressed";
+  case DimLevelType::LooseCompressedNu:
+    return "loose_compressed(nonunique)";
+  case DimLevelType::LooseCompressedNo:
+    return "loose_compressed(nonordered)";
+  case DimLevelType::LooseCompressedNuNo:
+    return "loose_compressed(nonunique, nonordered)";
   case DimLevelType::TwoOutOfFour:
-    return "compressed24";
+    return "block2_4";
   }
   return "";
 }
@@ -279,9 +279,9 @@ constexpr bool isCompressedDLT(DimLevelType dlt) {
 }
 
 /// Check if the `DimLevelType` is compressed (regardless of properties).
-constexpr bool isCompressedWithHiDLT(DimLevelType dlt) {
+constexpr bool isLooseCompressedDLT(DimLevelType dlt) {
   return (static_cast<uint8_t>(dlt) & ~3) ==
-         static_cast<uint8_t>(DimLevelType::CompressedWithHi);
+         static_cast<uint8_t>(DimLevelType::LooseCompressed);
 }
 
 /// Check if the `DimLevelType` is singleton (regardless of properties).
@@ -373,10 +373,10 @@ static_assert((isValidDLT(DimLevelType::Undef) &&
                isValidDLT(DimLevelType::SingletonNu) &&
                isValidDLT(DimLevelType::SingletonNo) &&
                isValidDLT(DimLevelType::SingletonNuNo) &&
-               isValidDLT(DimLevelType::CompressedWithHi) &&
-               isValidDLT(DimLevelType::CompressedWithHiNu) &&
-               isValidDLT(DimLevelType::CompressedWithHiNo) &&
-               isValidDLT(DimLevelType::CompressedWithHiNuNo) &&
+               isValidDLT(DimLevelType::LooseCompressed) &&
+               isValidDLT(DimLevelType::LooseCompressedNu) &&
+               isValidDLT(DimLevelType::LooseCompressedNo) &&
+               isValidDLT(DimLevelType::LooseCompressedNuNo) &&
                isValidDLT(DimLevelType::TwoOutOfFour)),
               "isValidDLT definition is broken");
 
@@ -391,16 +391,16 @@ static_assert((!isCompressedDLT(DimLevelType::Dense) &&
                !isCompressedDLT(DimLevelType::SingletonNuNo)),
               "isCompressedDLT definition is broken");
 
-static_assert((!isCompressedWithHiDLT(DimLevelType::Dense) &&
-               isCompressedWithHiDLT(DimLevelType::CompressedWithHi) &&
-               isCompressedWithHiDLT(DimLevelType::CompressedWithHiNu) &&
-               isCompressedWithHiDLT(DimLevelType::CompressedWithHiNo) &&
-               isCompressedWithHiDLT(DimLevelType::CompressedWithHiNuNo) &&
-               !isCompressedWithHiDLT(DimLevelType::Singleton) &&
-               !isCompressedWithHiDLT(DimLevelType::SingletonNu) &&
-               !isCompressedWithHiDLT(DimLevelType::SingletonNo) &&
-               !isCompressedWithHiDLT(DimLevelType::SingletonNuNo)),
-              "isCompressedWithHiDLT definition is broken");
+static_assert((!isLooseCompressedDLT(DimLevelType::Dense) &&
+               isLooseCompressedDLT(DimLevelType::LooseCompressed) &&
+               isLooseCompressedDLT(DimLevelType::LooseCompressedNu) &&
+               isLooseCompressedDLT(DimLevelType::LooseCompressedNo) &&
+               isLooseCompressedDLT(DimLevelType::LooseCompressedNuNo) &&
+               !isLooseCompressedDLT(DimLevelType::Singleton) &&
+               !isLooseCompressedDLT(DimLevelType::SingletonNu) &&
+               !isLooseCompressedDLT(DimLevelType::SingletonNo) &&
+               !isLooseCompressedDLT(DimLevelType::SingletonNuNo)),
+              "isLooseCompressedDLT definition is broken");
 
 static_assert((!isSingletonDLT(DimLevelType::Dense) &&
                !isSingletonDLT(DimLevelType::Compressed) &&
@@ -423,10 +423,10 @@ static_assert((isOrderedDLT(DimLevelType::Dense) &&
                isOrderedDLT(DimLevelType::SingletonNu) &&
                !isOrderedDLT(DimLevelType::SingletonNo) &&
                !isOrderedDLT(DimLevelType::SingletonNuNo) &&
-               isOrderedDLT(DimLevelType::CompressedWithHi) &&
-               isOrderedDLT(DimLevelType::CompressedWithHiNu) &&
-               !isOrderedDLT(DimLevelType::CompressedWithHiNo) &&
-               !isOrderedDLT(DimLevelType::CompressedWithHiNuNo)),
+               isOrderedDLT(DimLevelType::LooseCompressed) &&
+               isOrderedDLT(DimLevelType::LooseCompressedNu) &&
+               !isOrderedDLT(DimLevelType::LooseCompressedNo) &&
+               !isOrderedDLT(DimLevelType::LooseCompressedNuNo)),
               "isOrderedDLT definition is broken");
 
 static_assert((isUniqueDLT(DimLevelType::Dense) &&
@@ -439,10 +439,10 @@ static_assert((isUniqueDLT(DimLevelType::Dense) &&
                !isUniqueDLT(DimLevelType::SingletonNu) &&
                isUniqueDLT(DimLevelType::SingletonNo) &&
                !isUniqueDLT(DimLevelType::SingletonNuNo) &&
-               isUniqueDLT(DimLevelType::CompressedWithHi) &&
-               !isUniqueDLT(DimLevelType::CompressedWithHiNu) &&
-               isUniqueDLT(DimLevelType::CompressedWithHiNo) &&
-               !isUniqueDLT(DimLevelType::CompressedWithHiNuNo)),
+               isUniqueDLT(DimLevelType::LooseCompressed) &&
+               !isUniqueDLT(DimLevelType::LooseCompressedNu) &&
+               isUniqueDLT(DimLevelType::LooseCompressedNo) &&
+               !isUniqueDLT(DimLevelType::LooseCompressedNuNo)),
               "isUniqueDLT definition is broken");
 
 } // namespace sparse_tensor
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index 9cab6b6a027cdd..3eb9ce010cb006 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -114,10 +114,10 @@ SparseTensorEncodingAttr getSparseTensorEncoding(Type type);
 /// Convenience method to query whether a given DLT needs both position and
 /// coordinates array or only coordinates array.
 constexpr inline bool isDLTWithPos(DimLevelType dlt) {
-  return isCompressedWithHiDLT(dlt) || isCompressedDLT(dlt);
+  return isLooseCompressedDLT(dlt) || isCompressedDLT(dlt);
 }
 constexpr inline bool isDLTWithCrd(DimLevelType dlt) {
-  return isSingletonDLT(dlt) || isCompressedWithHiDLT(dlt) ||
+  return isSingletonDLT(dlt) || isLooseCompressedDLT(dlt) ||
          isCompressedDLT(dlt);
 }
 
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index d311fe7801cc18..cacc8176c67824 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -367,7 +367,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     bool isDenseLvl(::mlir::sparse_tensor::Level l) const { return isDenseDLT(getLvlType(l)); }
     bool isTwoOutOfFourLvl(::mlir::sparse_tensor::Level l) const { return isTwoOutOfFourDLT(getLvlType(l)); }
     bool isCompressedLvl(::mlir::sparse_tensor::Level l) const { return isCompressedDLT(getLvlType(l)); }
-    bool isCompressedWithHiLvl(::mlir::sparse_tensor::Level l) const { return isCompressedWithHiDLT(getLvlType(l)); }
+    bool isLooseCompressedLvl(::mlir::sparse_tensor::Level l) const { return isLooseCompressedDLT(getLvlType(l)); }
     bool isSingletonLvl(::mlir::sparse_tensor::Level l) const { return isSingletonDLT(getLvlType(l)); }
     bool isOrderedLvl(::mlir::sparse_tensor::Level l) const { return isOrderedDLT(getLvlType(l)); }
     bool isUniqueLvl(::mlir::sparse_tensor::Level l) const { return isUniqueDLT(getLvlType(l)); }
@@ -422,6 +422,14 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     std::optional<uint64_t> getStaticLvlSliceOffset(::mlir::sparse_tensor::Level lvl) const;
     std::optional<uint64_t> getStaticLvlSliceSize(::mlir::sparse_tensor::Level lvl) const;
     std::optional<uint64_t> getStaticLvlSliceStride(::mlir::sparse_tensor::Level lvl) const;
+
+    //
+    // Printing methods.
+    //
+
+    void printSymbols(AffineMap &map, AsmPrinter &printer) const;
+    void printDimensions(AffineMap &map, AsmPrinter &printer, ArrayRef<::mlir::sparse_tensor::SparseTensorDimSliceAttr> dimSlices) const;
+    void printLevels(AffineMap &map, AsmPrinter &printer, ArrayRef<::mlir::sparse_tensor::DimLevelType> lvlTypes) const;
   }];
 
   let genVerifyDecl = 1;
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index c566f674f7d401..7ea5ca23f122a8 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -762,7 +762,7 @@ def SparseTensor_OutOp : SparseTensor_Op<"out", []>,
 // Sparse Tensor Sorting Operations.
 //===----------------------------------------------------------------------===//
 
-def SparseTensor_SortCooOp : SparseTensor_Op<"sort_coo">,
+def SparseTensor_SortOp : SparseTensor_Op<"sort">,
     Arguments<(ins Index:$n, StridedMemRefRankOf<[AnyInteger, Index], [1]>:$xy,
                Variadic<StridedMemRefRankOf<[AnyType], [1]>>:$ys,
                AffineMapAttr:$perm_map, OptionalAttr<IndexAttr>:$ny,
@@ -770,7 +770,7 @@ def SparseTensor_SortCooOp : SparseTensor_Op<"sort_coo">,
   let summary = "Sorts the arrays in xs and ys lexicographically on the "
                 "integral values found in the xs list";
   let description = [{
-    Sparse_tensor.sort_coo sort the `xs` values along with some `ys` values
+    Sparse_tensor.sort sort the `xs` values along with some `ys` values
     that are put in a single linear buffer `xy`.
     The affine map attribute `perm_map` specifies the permutation to be applied on
     the `xs` before comparison, the rank of the permutation map
@@ -787,15 +787,9 @@ def SparseTensor_SortCooOp : SparseTensor_Op<"sort_coo">,
     Example:
 
     ```mlir
-    sparse_tensor.sort_coo insertion_sort_stable %n, %x { perm_map = affine_map<(i,j) -> (j,i)> }
+    sparse_tensor.sort insertion_sort_stable %n, %x { perm_map = affine_map<(i,j) -> (j,i)> }
       : memref<?xindex>
     ```
-
-    ```mlir
-    sparse_tensor.sort hybrid_quick_sort %n, %xy jointly %y1
-      { nx = 2 : index, ny = 2 : index}
-      : memref<?xi64> jointly memref<?xf32>
-    ```
   }];
 
   let assemblyFormat = "$algorithm $n"
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index e7da35a0c8145a..f4d9a251fb9783 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -798,7 +798,7 @@ def Tosa_MulOp : Tosa_ElementwiseOp<"mul", [
   let arguments = (ins
     Tosa_Tensor:$input1,
     Tosa_Tensor:$input2,
-    I32Attr:$shift
+    I8Attr:$shift
   );
 
   let results = (outs
diff --git a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h
index c8888f294f6ca1..b155b110677d6c 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h
+++ b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h
@@ -11,39 +11,71 @@
 
 #include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
 #include "mlir/IR/OpDefinition.h"
+#include "llvm/ADT/STLExtras.h"
+#include <optional>
+#include <type_traits>
 
 namespace mlir {
 namespace transform {
 class MatchOpInterface;
 
+namespace detail {
+/// Dispatch `matchOperation` based on Operation* or std::optional<Operation*>
+/// first operand.
 template <typename OpTy>
-class SingleOpMatcherOpTrait
-    : public OpTrait::TraitBase<OpTy, SingleOpMatcherOpTrait> {
+DiagnosedSilenceableFailure matchOptionalOperation(OpTy op,
+                                                   TransformResults &results,
+                                                   TransformState &state) {
+  if constexpr (std::is_same_v<
+                    typename llvm::function_traits<
+                        decltype(&OpTy::matchOperation)>::template arg_t<0>,
+                    Operation *>) {
+    return op.matchOperation(nullptr, results, state);
+  } else {
+    return op.matchOperation(std::nullopt, results, state);
+  }
+}
+} // namespace detail
+
+template <typename OpTy>
+class AtMostOneOpMatcherOpTrait
+    : public OpTrait::TraitBase<OpTy, AtMostOneOpMatcherOpTrait> {
   template <typename T>
   using has_get_operand_handle =
       decltype(std::declval<T &>().getOperandHandle());
   template <typename T>
-  using has_match_operation = decltype(std::declval<T &>().matchOperation(
+  using has_match_operation_ptr = decltype(std::declval<T &>().matchOperation(
       std::declval<Operation *>(), std::declval<TransformResults &>(),
       std::declval<TransformState &>()));
+  template <typename T>
+  using has_match_operation_optional =
+      decltype(std::declval<T &>().matchOperation(
+          std::declval<std::optional<Operation *>>(),
+          std::declval<TransformResults &>(),
+          std::declval<TransformState &>()));
 
 public:
   static LogicalResult verifyTrait(Operation *op) {
     static_assert(llvm::is_detected<has_get_operand_handle, OpTy>::value,
-                  "SingleOpMatcherOpTrait expects operation type to have the "
-                  "getOperandHandle() method");
-    static_assert(llvm::is_detected<has_match_operation, OpTy>::value,
-                  "SingleOpMatcherOpTrait expected operation type to have the "
-                  "matchOperation(Operation *, TransformResults &, "
-                  "TransformState &) method");
+                  "AtMostOneOpMatcherOpTrait/SingleOpMatcherOpTrait expects "
+                  "operation type to have the getOperandHandle() method");
+    static_assert(
+        llvm::is_detected<has_match_operation_ptr, OpTy>::value ||
+            llvm::is_detected<has_match_operation_optional, OpTy>::value,
+        "AtMostOneOpMatcherOpTrait/SingleOpMatcherOpTrait expected operation "
+        "type to have either the matchOperation(Operation *, TransformResults "
+        "&, TransformState &) or the matchOperation(std::optional<Operation*>, "
+        "TransformResults &, TransformState &) method");
 
     // This must be a dynamic assert because interface registration is dynamic.
-    assert(isa<MatchOpInterface>(op) &&
-           "SingleOpMatchOpTrait is only available on operations with "
-           "MatchOpInterface");
+    assert(
+        isa<MatchOpInterface>(op) &&
+        "AtMostOneOpMatcherOpTrait/SingleOpMatchOpTrait is only available on "
+        "operations with MatchOpInterface");
     Value operandHandle = cast<OpTy>(op).getOperandHandle();
     if (!isa<TransformHandleTypeInterface>(operandHandle.getType())) {
-      return op->emitError() << "SingleOpMatchOpTrait requires the op handle "
+      return op->emitError() << "AtMostOneOpMatcherOpTrait/"
+                                "SingleOpMatchOpTrait requires the op handle "
                                 "to be of TransformHandleTypeInterface";
     }
 
@@ -55,12 +87,15 @@ class SingleOpMatcherOpTrait
                                     TransformState &state) {
     Value operandHandle = cast<OpTy>(this->getOperation()).getOperandHandle();
     auto payload = state.getPayloadOps(operandHandle);
-    if (!llvm::hasSingleElement(payload)) {
+    if (!llvm::hasNItemsOrLess(payload, 1)) {
       return emitDefiniteFailure(this->getOperation()->getLoc())
-             << "SingleOpMatchOpTrait requires the operand handle to point to "
-                "a single payload op";
+             << "AtMostOneOpMatcherOpTrait requires the operand handle to "
+                "point to at most one payload op";
+    }
+    if (payload.empty()) {
+      return detail::matchOptionalOperation(cast<OpTy>(this->getOperation()),
+                                            results, state);
     }
-
     return cast<OpTy>(this->getOperation())
         .matchOperation(*payload.begin(), results, state);
   }
@@ -72,12 +107,32 @@ class SingleOpMatcherOpTrait
   }
 };
 
+template <typename OpTy>
+class SingleOpMatcherOpTrait : public AtMostOneOpMatcherOpTrait<OpTy> {
+
+public:
+  DiagnosedSilenceableFailure apply(TransformRewriter &rewriter,
+                                    TransformResults &results,
+                                    TransformState &state) {
+    Value operandHandle = cast<OpTy>(this->getOperation()).getOperandHandle();
+    auto payload = state.getPayloadOps(operandHandle);
+    if (!llvm::hasSingleElement(payload)) {
+      return emitDefiniteFailure(this->getOperation()->getLoc())
+             << "SingleOpMatchOpTrait requires the operand handle to point to "
+                "a single payload op";
+    }
+    return static_cast<AtMostOneOpMatcherOpTrait<OpTy> *>(this)->apply(
+        rewriter, results, state);
+  }
+};
+
 template <typename OpTy>
 class SingleValueMatcherOpTrait
     : public OpTrait::TraitBase<OpTy, SingleValueMatcherOpTrait> {
 public:
   static LogicalResult verifyTrait(Operation *op) {
-    // This must be a dynamic assert because interface registration is dynamic.
+    // This must be a dynamic assert because interface registration is
+    // dynamic.
     assert(isa<MatchOpInterface>(op) &&
            "SingleValueMatchOpTrait is only available on operations with "
            "MatchOpInterface");
@@ -98,8 +153,8 @@ class SingleValueMatcherOpTrait
     auto payload = state.getPayloadValues(operandHandle);
     if (!llvm::hasSingleElement(payload)) {
       return emitDefiniteFailure(this->getOperation()->getLoc())
-             << "SingleValueMatchOpTrait requires the value handle to point to "
-                "a single payload value";
+             << "SingleValueMatchOpTrait requires the value handle to point "
+                "to a single payload value";
     }
 
     return cast<OpTy>(this->getOperation())
diff --git a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td
index 1f81fd5252eb45..be92e4d91b42b3 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td
@@ -14,11 +14,28 @@ def MatchOpInterface
   let cppNamespace = "::mlir::transform";
 }
 
+// Trait for "matcher" transform operations that apply to an operation handle
+// associated with at most one payload operation. Checks that it is indeed
+// the case and produces a definite failure when it is not. The matching logic
+// is implemented in the `matchOperation` function instead of `apply`. The op
+// with this trait must provide a `Value getOperandHandle()` function that
+// returns the handle to be used for matching.
+def AtMostOneOpMatcher : NativeOpTrait<"AtMostOneOpMatcherOpTrait"> {
+  let cppNamespace = "::mlir::transform";
+
+  string extraDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure matchOperation(
+          ::std::optional<::mlir::Operation *> maybeCurrent,
+          ::mlir::transform::TransformResults &results,
+          ::mlir::transform::TransformState &state);
+  }];
+}
+
 // Trait for "matcher" transform operations that apply to an operation handle
 // associated with exactly one payload operation. Checks that it is indeed
 // the case and produces a definite failure when it is not. The matching logic
 // is implemented in the `matchOperation` function instead of `apply`. The op
-// with this trait must provide a `Value getOperandHandle()` function that 
+// with this trait must provide a `Value getOperandHandle()` function that
 // returns the handle to be used for matching.
 def SingleOpMatcher : NativeOpTrait<"SingleOpMatcherOpTrait"> {
   let cppNamespace = "::mlir::transform";
@@ -35,7 +52,7 @@ def SingleOpMatcher : NativeOpTrait<"SingleOpMatcherOpTrait"> {
 // associated with exactly one payload value. Checks that it is indeed
 // the case and produces a definite failure when it is not. The matching logic
 // is implemented in the `matchValue` function instead of `apply`. The op
-// with this trait must provide a `Value getOperandHandle()` function that 
+// with this trait must provide a `Value getOperandHandle()` function that
 // returns the handle to be used for matching.
 def SingleValueMatcher : NativeOpTrait<"SingleValueMatcherOpTrait"> {
   let cppNamespace = "::mlir::transform";
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
index 3448e27a41a680..f28205a2550702 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
@@ -20,107 +20,118 @@ def Transform_Dialect : Dialect {
 
   let hasOperationAttrVerify = 1;
   let extraClassDeclaration = [{
-      /// Name of the attribute attachable to the symbol table operation
-      /// containing named sequences. This is used to trigger verification.
-      constexpr const static ::llvm::StringLiteral
-          kWithNamedSequenceAttrName = "transform.with_named_sequence";
-
-      /// Name of the attribute attachable to an operation so it can be
-      /// identified as root by the default interpreter pass.
-      constexpr const static ::llvm::StringLiteral
-          kTargetTagAttrName = "transform.target_tag";
-
-      /// Name of the attribute attachable to an operation, indicating that
-      /// TrackingListener failures should be silenced.
-      constexpr const static ::llvm::StringLiteral
-          kSilenceTrackingFailuresAttrName = "transform.silence_tracking_failures";
-
-      /// Names of the attributes indicating whether an argument of an external
-      /// transform dialect symbol is consumed or only read.
-      constexpr const static ::llvm::StringLiteral
-          kArgConsumedAttrName = "transform.consumed";
-      constexpr const static ::llvm::StringLiteral
-          kArgReadOnlyAttrName = "transform.readonly";
-
-      template <typename DataTy>
-      const DataTy &getExtraData() const {
-        return *static_cast<const DataTy *>(extraData.at(::mlir::TypeID::get<DataTy>()).get());
-      }
-
-      /// Parses a type registered by this dialect or one of its extensions.
-      ::mlir::Type parseType(::mlir::DialectAsmParser &parser) const override;
-
-      /// Prints a type registered by this dialect or one of its extensions.
-      void printType(::mlir::Type type,
-                     ::mlir::DialectAsmPrinter &printer) const override;
-
-      /// Parser callback for an individual type registered by this dialect or
-      /// its extensions.
-      using ExtensionTypeParsingHook = ::mlir::Type (*)(::mlir::AsmParser &);
-
-      /// Printer callback for an individual type registered by this dialect or
-      /// its extensions.
-      using ExtensionTypePrintingHook =
-          std::function<void (::mlir::Type, ::mlir::AsmPrinter &)>;
-
-    private:
-      /// Registers operations specified as template parameters with this
-      /// dialect. Checks that they implement the required interfaces.
-      template <typename... OpTys>
-      void addOperationsChecked() {
-        (addOperationIfNotRegistered<OpTys>(), ...);
-      }
-      template <typename OpTy>
-      void addOperationIfNotRegistered();
-
-      /// Reports a repeated registration error of an op with the given name.
-      [[noreturn]] void reportDuplicateOpRegistration(StringRef opName);
-
-      /// Registers the types specified as template parameters with the
-      /// Transform dialect. Checks that they meet the requirements for
-      /// Transform IR types.
-      template <typename... TypeTys>
-      void addTypesChecked() {
-        (addTypeIfNotRegistered<TypeTys>(), ...);
-      }
-      template <typename Type>
-      void addTypeIfNotRegistered();
-
-      /// Reports a repeated registration error of a type with the given
-      /// mnemonic.
-      [[noreturn]] void reportDuplicateTypeRegistration(StringRef mnemonic);
-
-      /// Registers dialect types with the context.
-      void initializeTypes();
-
-      // Give extensions access to injection functions.
-      template <typename, typename...>
-      friend class TransformDialectExtension;
-
-      /// Gets a mutable reference to extra data of the kind specified as
-      /// template argument. Allocates the data on the first call.
-      template <typename DataTy>
-      DataTy &getOrCreateExtraData();
-
-      //===----------------------------------------------------------------===//
-      // Data fields
-      //===----------------------------------------------------------------===//
-
-      /// Additional data associated with and owned by the dialect. Accessible
-      /// to extensions.
-      ::llvm::DenseMap<::mlir::TypeID, std::unique_ptr<
-            ::mlir::transform::detail::TransformDialectDataBase>>
-          extraData;
-
-      /// A map from type mnemonic to its parsing function for the remainder of
-      /// the syntax. The parser has access to the mnemonic, so it is used for
-      /// further dispatch.
-      ::llvm::StringMap<ExtensionTypeParsingHook> typeParsingHooks;
-
-      /// A map from type TypeID to its printing function. No need to do string
-      /// lookups when the type is fully constructed.
-      ::llvm::DenseMap<::mlir::TypeID, ExtensionTypePrintingHook>
-      typePrintingHooks;
+    /// Name of the attribute attachable to the symbol table operation
+    /// containing named sequences. This is used to trigger verification.
+    constexpr const static ::llvm::StringLiteral kWithNamedSequenceAttrName =
+        "transform.with_named_sequence";
+
+    /// Name of the attribute attachable to an operation so it can be
+    /// identified as root by the default interpreter pass.
+    constexpr const static ::llvm::StringLiteral kTargetTagAttrName =
+        "transform.target_tag";
+
+    /// Name of the attribute attachable to an operation, indicating that
+    /// TrackingListener failures should be silenced.
+    constexpr const static ::llvm::StringLiteral
+        kSilenceTrackingFailuresAttrName =
+            "transform.silence_tracking_failures";
+
+    /// Names of the attributes indicating whether an argument of an external
+    /// transform dialect symbol is consumed or only read.
+    constexpr const static ::llvm::StringLiteral kArgConsumedAttrName =
+        "transform.consumed";
+    constexpr const static ::llvm::StringLiteral kArgReadOnlyAttrName =
+        "transform.readonly";
+
+    /// Names of the attributes indicating whether an argument of an external
+    /// transform dialect symbol is consumed or only read.
+    StringAttr getConsumedAttrName() const {
+      return StringAttr::get(getContext(), kArgConsumedAttrName);
+    }
+    StringAttr getReadOnlyAttrName() const {
+      return StringAttr::get(getContext(), kArgReadOnlyAttrName);
+    }
+
+    template <typename DataTy>
+    const DataTy &getExtraData() const {
+      return *static_cast<const DataTy *>(
+          extraData.at(::mlir::TypeID::get<DataTy>()).get());
+    }
+
+    /// Parses a type registered by this dialect or one of its extensions.
+    ::mlir::Type parseType(::mlir::DialectAsmParser & parser) const override;
+
+    /// Prints a type registered by this dialect or one of its extensions.
+    void printType(::mlir::Type type, ::mlir::DialectAsmPrinter & printer)
+        const override;
+
+    /// Parser callback for an individual type registered by this dialect or
+    /// its extensions.
+    using ExtensionTypeParsingHook = ::mlir::Type (*)(::mlir::AsmParser &);
+
+    /// Printer callback for an individual type registered by this dialect or
+    /// its extensions.
+    using ExtensionTypePrintingHook =
+        std::function<void(::mlir::Type, ::mlir::AsmPrinter &)>;
+
+  private:
+    /// Registers operations specified as template parameters with this
+    /// dialect. Checks that they implement the required interfaces.
+    template <typename... OpTys>
+    void addOperationsChecked() {
+      (addOperationIfNotRegistered<OpTys>(), ...);
+    }
+    template <typename OpTy>
+    void addOperationIfNotRegistered();
+
+    /// Reports a repeated registration error of an op with the given name.
+    [[noreturn]] void reportDuplicateOpRegistration(StringRef opName);
+
+    /// Registers types specified as template parameters with the Transform
+    /// dialect. Checks that they meet the requirements for Transform IR types.
+    template <typename... TypeTys>
+    void addTypesChecked() {
+      (addTypeIfNotRegistered<TypeTys>(), ...);
+    }
+    template <typename Type>
+    void addTypeIfNotRegistered();
+
+    /// Reports a repeated registration error of a type with the given
+    /// mnemonic.
+    [[noreturn]] void reportDuplicateTypeRegistration(StringRef mnemonic);
+
+    /// Registers dialect types with the context.
+    void initializeTypes();
+
+    // Give extensions access to injection functions.
+    template <typename, typename...>
+    friend class TransformDialectExtension;
+
+    /// Gets a mutable reference to extra data of the kind specified as
+    /// template argument. Allocates the data on the first call.
+    template <typename DataTy>
+    DataTy &getOrCreateExtraData();
+
+    //===----------------------------------------------------------------===//
+    // Data fields
+    //===----------------------------------------------------------------===//
+
+    /// Additional data associated with and owned by the dialect. Accessible
+    /// to extensions.
+    ::llvm::DenseMap<
+        ::mlir::TypeID,
+        std::unique_ptr<::mlir::transform::detail::TransformDialectDataBase>>
+        extraData;
+
+    /// A map from type mnemonic to its parsing function for the remainder of
+    /// the syntax. The parser has access to the mnemonic, so it is used for
+    /// further dispatch.
+    ::llvm::StringMap<ExtensionTypeParsingHook> typeParsingHooks;
+
+    /// A map from type TypeID to its printing function. No need to do string
+    /// lookups when the type is fully constructed.
+    ::llvm::DenseMap<::mlir::TypeID, ExtensionTypePrintingHook>
+        typePrintingHooks;
   }];
 }
 
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index ca5c915ef8c2ca..5bc92e8e954eae 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -595,8 +595,9 @@ def GetDefiningOp : TransformDialectOp<"get_defining_op",
 
 def GetParentOp : TransformDialectOp<"get_parent_op",
     [DeclareOpInterfaceMethods<TransformOpInterface>,
+     MatchOpInterface,
      NavigationTransformOpTrait, MemoryEffectsOpInterface]> {
-  let summary = "Gets handles to the closest isolated-from-above parents";
+  let summary = "Gets handles to the closest parent ops";
   let description = [{
     The handle defined by this Transform op corresponds to the parents of the
     targeted payload ops (in the same order).
@@ -605,6 +606,9 @@ def GetParentOp : TransformDialectOp<"get_parent_op",
     that case for each target op, the closest parent op that fulfills all
     requirements, is returned.
     - `isolated_from_above`: the parent op must be isolated from above
+    - `allow_empty_results`: get_parent_op is allowed to return an empty list and
+      still succeeds. In such a case, if get_parent_op fails for any operation
+      in the list, the entire transform returns an empty handle.
     - `op_name`: the parent op must have the specified name
 
     If `deduplicate` is set, the result handle does not contain any duplicate
@@ -614,12 +618,14 @@ def GetParentOp : TransformDialectOp<"get_parent_op",
     is applied, e.g., "B" may itself be a parent of "A". This may have an impact
     on the further transformation applied to the handle produced here.
 
-    If any of the given Payload IR ops has no such suitable parent, the
-    transformation fails silently.
+    If any of the given Payload IR ops has no such suitable parent, then:
+      - if `allow_empty_results` is set, the result handle is empty
+      - otherwise, the transformation produces a silenceable failure.
   }];
 
   let arguments = (ins TransformHandleTypeInterface:$target,
                        UnitAttr:$isolated_from_above,
+                       UnitAttr:$allow_empty_results,
                        OptionalAttr<StrAttr>:$op_name,
                        UnitAttr:$deduplicate);
   let results = (outs TransformHandleTypeInterface:$parent);
@@ -739,6 +745,21 @@ def IncludeOp : TransformDialectOp<"include",
   }];
 }
 
+def MatchOperationEmptyOp : Op<Transform_Dialect, "match.operation_empty", [
+    AtMostOneOpMatcher,
+    MatchOpInterface,
+    MemoryEffectsOpInterface]> {
+  let summary =
+    "Matches if the handle is not associated to any op";
+  let description = [{
+    Succeeds if the handle is not associated to any op.
+  }];
+  let arguments = (ins TransformHandleTypeInterface:$operand_handle);
+  let assemblyFormat =
+      "$operand_handle attr-dict `:` type($operand_handle)";
+  let extraClassDeclaration = AtMostOneOpMatcher.extraDeclaration;
+}
+
 def MatchOperationNameOp : TransformDialectOp<"match.operation_name",
     [SingleOpMatcher,
      MatchOpInterface,
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
index 91903e254b0d5b..a6f0dddebd7eac 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
@@ -62,9 +62,11 @@ LogicalResult interpreterBaseRunOnOperationImpl(
 ///     transform script. If empty, `debugTransformRootTag` is considered or the
 ///     pass root operation must contain a single top-level transform op that
 ///     will be interpreted.
-///   - transformLibraryFileName: if non-empty, the name of the file containing
-///     definitions of external symbols referenced in the transform script.
-///     These definitions will be used to replace declarations.
+///   - transformLibraryFileName: if non-empty, the module in this file will be
+///     merged into the main transform script run by the interpreter before
+///     execution. This allows to provide definitions for external functions
+///     used in the main script. Other public symbols in the library module may
+///     lead to collisions with public symbols in the main script.
 ///   - debugPayloadRootTag: if non-empty, the value of the attribute named
 ///     `kTransformDialectTagAttrName` indicating the single op that is
 ///     considered the payload root of the transform interpreter; otherwise, the
@@ -85,7 +87,7 @@ LogicalResult interpreterBaseRunOnOperationImpl(
 /// as template arguments. They are *not* expected to to implement `initialize`
 /// or `runOnOperation`. They *are* expected to call the copy constructor of
 /// this class in their copy constructors, short of which the file-based
-/// transform dialect script injection facility will become nonoperational.
+/// transform dialect script injection facility will become non-operational.
 ///
 /// Concrete passes may implement the `runBeforeInterpreter` and
 /// `runAfterInterpreter` to customize the behavior of the pass.
@@ -98,6 +100,7 @@ class TransformInterpreterPassBase : public GeneratedBase<Concrete> {
 
   TransformInterpreterPassBase(const TransformInterpreterPassBase &pass) {
     sharedTransformModule = pass.sharedTransformModule;
+    transformLibraryModule = pass.transformLibraryModule;
     options = pass.options;
   }
 
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h
index 40c68177bbea82..f6eb45defcc1cf 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h
@@ -23,20 +23,19 @@ namespace mlir {
 namespace sparse_tensor {
 
 /// An element of a sparse tensor in coordinate-scheme representation
-/// (i.e., a pair of coordinates and value).  For example, a rank-1
+/// (i.e., a pair of coordinates and value). For example, a rank-1
 /// vector element would look like
 ///   ({i}, a[i])
 /// and a rank-5 tensor element would look like
 ///   ({i,j,k,l,m}, a[i,j,k,l,m])
 ///
-/// The coordinates are represented as a (non-owning) pointer into
-/// a shared pool of coordinates, rather than being stored directly in
-/// this object.  This significantly improves performance because it:
-/// (1) reduces the per-element memory footprint, and (2) centralizes
-/// the memory management for coordinates.  The only downside is that
-/// the coordinates themselves cannot be retrieved without knowing the
-/// rank of the tensor to which this element belongs (and that rank is
-/// not stored in this object).
+/// The coordinates are represented as a (non-owning) pointer into a
+/// shared pool of coordinates, rather than being stored directly in this
+/// object. This significantly improves performance because it reduces the
+/// per-element memory footprint and centralizes the memory management for
+/// coordinates. The only downside is that the coordinates themselves cannot
+/// be retrieved without knowing the rank of the tensor to which this element
+/// belongs (and that rank is not stored in this object).
 template <typename V>
 struct Element final {
   Element(const uint64_t *coords, V val) : coords(coords), value(val){};
@@ -48,10 +47,6 @@ struct Element final {
 template <typename V>
 struct ElementLT final {
   ElementLT(uint64_t rank) : rank(rank) {}
-
-  /// Compares two elements a la `operator<`.
-  ///
-  /// Precondition: the elements must both be valid for `rank`.
   bool operator()(const Element<V> &e1, const Element<V> &e2) const {
     for (uint64_t d = 0; d < rank; ++d) {
       if (e1.coords[d] == e2.coords[d])
@@ -60,13 +55,10 @@ struct ElementLT final {
     }
     return false;
   }
-
   const uint64_t rank;
 };
 
-/// The type of callback functions which receive an element.  We avoid
-/// packaging the coordinates and value together as an `Element` object
-/// because this helps keep code somewhat cleaner.
+/// The type of callback functions which receive an element.
 template <typename V>
 using ElementConsumer =
     const std::function<void(const std::vector<uint64_t> &, V)> &;
@@ -79,37 +71,17 @@ using ElementConsumer =
 template <typename V>
 class SparseTensorCOO final {
 public:
-  using value_type = const Element<V>;
-  using reference = value_type &;
-  using const_reference = reference;
-  using vector_type = std::vector<Element<V>>;
-  using iterator = typename vector_type::const_iterator;
-  using const_iterator = iterator;
-  using difference_type = typename vector_type::difference_type;
-  using size_type = typename vector_type::size_type;
+  using const_iterator = typename std::vector<Element<V>>::const_iterator;
 
   /// Constructs a new coordinate-scheme sparse tensor with the given
-  /// sizes and initial storage capacity.
-  ///
-  /// Asserts:
-  /// * `dimSizes` has nonzero size.
-  /// * the elements of `dimSizes` are nonzero.
+  /// sizes and an optional initial storage capacity.
   explicit SparseTensorCOO(const std::vector<uint64_t> &dimSizes,
                            uint64_t capacity = 0)
       : SparseTensorCOO(dimSizes.size(), dimSizes.data(), capacity) {}
 
-  // TODO: make a class for capturing known-valid sizes (a la PermutationRef),
-  // so that `SparseTensorStorage::toCOO` can avoid redoing these assertions.
-  // Also so that we can enforce the asserts *before* copying into `dimSizes`.
-  //
   /// Constructs a new coordinate-scheme sparse tensor with the given
-  /// sizes and initial storage capacity.
-  ///
-  /// Precondition: `dimSizes` must be valid for `dimRank`.
-  ///
-  /// Asserts:
-  /// * `dimRank` is nonzero.
-  /// * the elements of `dimSizes` are nonzero.
+  /// sizes and an optional initial storage capacity. The size of the
+  /// dimSizes array is determined by dimRank.
   explicit SparseTensorCOO(uint64_t dimRank, const uint64_t *dimSizes,
                            uint64_t capacity = 0)
       : dimSizes(dimSizes, dimSizes + dimRank), isSorted(true) {
@@ -134,16 +106,7 @@ class SparseTensorCOO final {
   /// Returns the `operator<` closure object for the COO's element type.
   ElementLT<V> getElementLT() const { return ElementLT<V>(getRank()); }
 
-  /// Adds an element to the tensor.  This method does not check whether
-  /// `dimCoords` is already associated with a value, it adds it regardless.
-  /// Resolving such conflicts is left up to clients of the iterator
-  /// interface.
-  ///
-  /// This method invalidates all iterators.
-  ///
-  /// Asserts:
-  /// * the `dimCoords` is valid for `getRank`.
-  /// * the components of `dimCoords` are valid for `getDimSizes`.
+  /// Adds an element to the tensor. This method invalidates all iterators.
   void add(const std::vector<uint64_t> &dimCoords, V val) {
     const uint64_t *base = coordinates.data();
     const uint64_t size = coordinates.size();
@@ -154,7 +117,7 @@ class SparseTensorCOO final {
              "Coordinate is too large for the dimension");
       coordinates.push_back(dimCoords[d]);
     }
-    // This base only changes if `coordinates` was reallocated.  In which
+    // This base only changes if `coordinates` was reallocated. In which
     // case, we need to correct all previous pointers into the vector.
     // Note that this only happens if we did not set the initial capacity
     // right, and then only for every internal vector reallocation (which
@@ -175,11 +138,9 @@ class SparseTensorCOO final {
   const_iterator begin() const { return elements.cbegin(); }
   const_iterator end() const { return elements.cend(); }
 
-  /// Sorts elements lexicographically by coordinates.  If a coordinate
+  /// Sorts elements lexicographically by coordinates. If a coordinate
   /// is mapped to multiple values, then the relative order of those
-  /// values is unspecified.
-  ///
-  /// This method invalidates all iterators.
+  /// values is unspecified. This method invalidates all iterators.
   void sort() {
     if (isSorted)
       return;
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
index 1b6736b0cdd1d1..78c1a0544e3a52 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -1,4 +1,4 @@
-//===- File.h - Parsing sparse tensors from files ---------------*- C++ -*-===//
+//===- File.h - Reading/writing sparse tensors from/to files ----*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements parsing and printing of files in one of the
+// This file implements reading and writing sparse tensor files in one of the
 // following external formats:
 //
 // (1) Matrix Market Exchange (MME): *.mtx
@@ -197,31 +197,14 @@ class SparseTensorReader final {
 
   /// Allocates a new COO object for `lvlSizes`, initializes it by reading
   /// all the elements from the file and applying `dim2lvl` to their
-  /// dim-coordinates, and then closes the file.
-  ///
-  /// Preconditions:
-  /// * `lvlSizes` must be valid for `lvlRank`.
-  /// * `dim2lvl` must be valid for `getRank()`.
-  /// * `dim2lvl` maps `getDimSizes()`-coordinates to `lvlSizes`-coordinates.
-  /// * the file's actual value type can be read as `V`.
-  ///
-  /// Asserts:
-  /// * `isValid()`
-  /// * `dim2lvl` is a permutation, and therefore also `lvlRank == getRank()`.
-  ///   (This requirement will be lifted once we functionalize `dim2lvl`.)
-  //
-  // NOTE: This method is factored out of `readSparseTensor` primarily to
-  // reduce code bloat (since the bulk of the code doesn't care about the
-  // `<P,I>` type template parameters).  But we leave it public since it's
-  // perfectly reasonable for clients to use.
+  /// dim-coordinates, and then closes the file. Templated on V only.
   template <typename V>
   SparseTensorCOO<V> *readCOO(uint64_t lvlRank, const uint64_t *lvlSizes,
                               const uint64_t *dim2lvl);
 
   /// Allocates a new sparse-tensor storage object with the given encoding,
   /// initializes it by reading all the elements from the file, and then
-  /// closes the file.  Preconditions/assertions are as per `readCOO`
-  /// and `SparseTensorStorage::newFromCOO`.
+  /// closes the file. Templated on P, I, and V.
   template <typename P, typename I, typename V>
   SparseTensorStorage<P, I, V> *
   readSparseTensor(uint64_t lvlRank, const uint64_t *lvlSizes,
@@ -312,10 +295,6 @@ SparseTensorCOO<V> *SparseTensorReader::readCOO(uint64_t lvlRank,
                                                 const uint64_t *lvlSizes,
                                                 const uint64_t *dim2lvl) {
   assert(isValid() && "Attempt to readCOO() before readHeader()");
-  // Construct a `PermutationRef` for the `pushforward` below.
-  // TODO: This specific implementation does not generalize to arbitrary
-  // mappings, but once we functionalize the `dim2lvl` argument we can
-  // simply use that function instead.
   const uint64_t dimRank = getRank();
   assert(lvlRank == dimRank && "Rank mismatch");
   detail::PermutationRef d2l(dimRank, dim2lvl);
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 68dcab6e64c7e4..28c28c28109c3c 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -25,9 +25,30 @@
 #include "mlir/ExecutionEngine/SparseTensor/COO.h"
 #include "mlir/ExecutionEngine/SparseTensor/ErrorHandling.h"
 
+#define ASSERT_VALID_DIM(d)                                                    \
+  assert(d < getDimRank() && "Dimension is out of bounds");
+#define ASSERT_VALID_LVL(l)                                                    \
+  assert(l < getLvlRank() && "Level is out of bounds");
+#define ASSERT_COMPRESSED_LVL(l)                                               \
+  assert(isCompressedLvl(l) && "Level is not compressed");
+#define ASSERT_COMPRESSED_OR_SINGLETON_LVL(l)                                  \
+  do {                                                                         \
+    const DimLevelType dlt = getLvlType(l);                                    \
+    (void)dlt;                                                                 \
+    assert((isCompressedDLT(dlt) || isSingletonDLT(dlt)) &&                    \
+           "Level is neither compressed nor singleton");                       \
+  } while (false)
+#define ASSERT_DENSE_DLT(dlt) assert(isDenseDLT(dlt) && "Level is not dense");
+
 namespace mlir {
 namespace sparse_tensor {
 
+// Forward references.
+template <typename V>
+class SparseTensorEnumeratorBase;
+template <typename P, typename C, typename V>
+class SparseTensorEnumerator;
+
 namespace detail {
 
 /// Checks whether the `perm` array is a permutation of `[0 .. size)`.
@@ -125,33 +146,6 @@ class PermutationRef final {
 
 } // namespace detail
 
-//===----------------------------------------------------------------------===//
-// This forward decl is sufficient to split `SparseTensorStorageBase` into
-// its own header, but isn't sufficient for `SparseTensorStorage` to join it.
-template <typename V>
-class SparseTensorEnumeratorBase;
-
-// These macros ensure consistent error messages, without risk of incuring
-// an additional method call to do so.
-#define ASSERT_VALID_DIM(d)                                                    \
-  assert(d < getDimRank() && "Dimension is out of bounds");
-#define ASSERT_VALID_LVL(l)                                                    \
-  assert(l < getLvlRank() && "Level is out of bounds");
-#define ASSERT_COMPRESSED_LVL(l)                                               \
-  assert(isCompressedLvl(l) && "Level is not compressed");
-#define ASSERT_COMPRESSED_OR_SINGLETON_LVL(l)                                  \
-  do {                                                                         \
-    const DimLevelType dlt = getLvlType(l);                                    \
-    (void)dlt;                                                                 \
-    assert((isCompressedDLT(dlt) || isSingletonDLT(dlt)) &&                    \
-           "Level is neither compressed nor singleton");                       \
-  } while (false)
-// Because the `SparseTensorStorageBase` ctor uses `MLIR_SPARSETENSOR_FATAL`
-// (rather than `assert`) when validating level-types, all the uses of
-// `ASSERT_DENSE_DLT` are technically unnecessary.  However, they are
-// retained for the sake of future-proofing.
-#define ASSERT_DENSE_DLT(dlt) assert(isDenseDLT(dlt) && "Level is not dense");
-
 /// Abstract base class for `SparseTensorStorage<P,C,V>`.  This class
 /// takes responsibility for all the `<P,C,V>`-independent aspects
 /// of the tensor (e.g., shape, sparsity, permutation).  In addition,
@@ -185,23 +179,9 @@ class SparseTensorEnumeratorBase;
 /// specified.  Thus, dynamic cardinalities always have an "immutable but
 /// unknown" value; so the term "dynamic" should not be taken to indicate
 /// run-time mutability.
-//
-// TODO: we'd like to factor out a class akin to `PermutationRef` for
-// capturing known-valid sizes to avoid redundant validity assertions.
-// But calling that class "SizesRef" would be a terrible name (and
-// "ValidSizesRef" isn't much better).  Whereas, calling it "ShapeRef"
-// would be a lot nicer, but then that conflicts with the terminology
-// introduced above.  So we need to come up with some new terminology
-// for distinguishing things, which allows a reasonable class name too.
 class SparseTensorStorageBase {
 protected:
-  // Since this class is virtual, we must disallow public copying in
-  // order to avoid "slicing".  Since this class has data members,
-  // that means making copying protected.
-  // <https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#Rc-copy-virtual>
   SparseTensorStorageBase(const SparseTensorStorageBase &) = default;
-  // Copy-assignment would be implicitly deleted (because our fields
-  // are const), so we explicitly delete it for clarity.
   SparseTensorStorageBase &operator=(const SparseTensorStorageBase &) = delete;
 
 public:
@@ -313,10 +293,8 @@ class SparseTensorStorageBase {
   MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETVALUES)
 #undef DECL_GETVALUES
 
-  /// Element-wise insertion in lexicographic coordinate order.  The first
+  /// Element-wise insertion in lexicographic coordinate order. The first
   /// argument is the level-coordinates for the value being inserted.
-  // TODO: For better safety, this should take a parameter for the
-  // length of `lvlCoords` and check that against `getLvlRank()`.
 #define DECL_LEXINSERT(VNAME, V) virtual void lexInsert(const uint64_t *, V);
   MLIR_SPARSETENSOR_FOREVERY_V(DECL_LEXINSERT)
 #undef DECL_LEXINSERT
@@ -348,12 +326,6 @@ class SparseTensorStorageBase {
   const std::vector<uint64_t> lvl2dim;
 };
 
-//===----------------------------------------------------------------------===//
-// This forward decl is necessary for defining `SparseTensorStorage`,
-// but isn't sufficient for splitting it off.
-template <typename P, typename C, typename V>
-class SparseTensorEnumerator;
-
 /// A memory-resident sparse tensor using a storage scheme based on
 /// per-level sparse/dense annotations.  This data structure provides
 /// a bufferized form of a sparse tensor type.  In contrast to generating
@@ -612,7 +584,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
         [&coo](const auto &trgCoords, V val) { coo->add(trgCoords, val); });
     // TODO: This assertion assumes there are no stored zeros,
     // or if there are then that we don't filter them out.
-    // Cf., <https://github.com/llvm/llvm-project/issues/54179>
+    // <https://github.com/llvm/llvm-project/issues/54179>
     assert(coo->getElements().size() == values.size());
     return coo;
   }
@@ -783,8 +755,11 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     for (uint64_t l = 0; l < lvlRank; ++l) {
       const auto crd = lvlCoords[l];
       const auto cur = lvlCursor[l];
-      if (crd > cur || (crd == cur && !isUniqueLvl(l)))
+      if (crd > cur || (crd == cur && !isUniqueLvl(l)) ||
+          (crd < cur && !isOrderedLvl(l))) {
         return l;
+      }
+
       if (crd < cur) {
         assert(false && "non-lexicographic insertion");
         return -1u;
@@ -900,8 +875,7 @@ class SparseTensorEnumeratorBase {
 
 //===----------------------------------------------------------------------===//
 template <typename P, typename C, typename V>
-class SparseTensorEnumerator final
-    : public SparseTensorEnumeratorBase<V> {
+class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> {
   using Base = SparseTensorEnumeratorBase<V>;
   using StorageImpl = SparseTensorStorage<P, C, V>;
 
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
index 14f89bee05bf55..8f320f04f23fc8 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
@@ -6,9 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This header file provides the enums and functions which comprise the
-// public API of the `ExecutionEngine/SparseTensorRuntime.cpp` runtime
-// support library for the SparseTensor dialect.
+// This header file provides the functions which comprise the public API of the
+// sparse tensor runtime support library for the SparseTensor dialect.
 //
 //===----------------------------------------------------------------------===//
 
@@ -153,8 +152,7 @@ MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETNEXT)
 #undef DECL_GETNEXT
 
 /// Reads the sparse tensor, stores the coordinates and values to the given
-/// memrefs. Returns a boolean value to indicate whether the COO elements are
-/// sorted.
+/// memrefs. Returns a boolean to indicate whether the COO elements are sorted.
 #define DECL_GETNEXT(VNAME, V, CNAME, C)                                       \
   MLIR_CRUNNERUTILS_EXPORT bool                                                \
       _mlir_ciface_getSparseTensorReaderReadToBuffers##CNAME##VNAME(           \
@@ -240,8 +238,7 @@ MLIR_CRUNNERUTILS_EXPORT index_type getSparseTensorReaderNSE(void *p);
 MLIR_CRUNNERUTILS_EXPORT index_type getSparseTensorReaderDimSize(void *p,
                                                                  index_type d);
 
-/// Releases the SparseTensorReader. This also closes the file associated with
-/// the reader.
+/// Releases the SparseTensorReader and closes the associated file.
 MLIR_CRUNNERUTILS_EXPORT void delSparseTensorReader(void *p);
 
 /// Creates a SparseTensorWriter for outputting a sparse tensor to a file
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 3eca8bd12f4336..5e54d4ea49e825 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -477,8 +477,8 @@ class OpBuilder : public Builder {
     if (LLVM_UNLIKELY(!opName)) {
       llvm::report_fatal_error(
           "Building op `" + OpT::getOperationName() +
-          "` but it isn't registered in this MLIRContext: the dialect may not "
-          "be loaded or this operation isn't registered by the dialect. See "
+          "` but it isn't known in this MLIRContext: the dialect may not "
+          "be loaded or this operation hasn't been added by the dialect. See "
           "also https://mlir.llvm.org/getting_started/Faq/"
           "#registered-loaded-dependent-whats-up-with-dialects-management");
     }
diff --git a/mlir/include/mlir/IR/StorageUniquerSupport.h b/mlir/include/mlir/IR/StorageUniquerSupport.h
index c466e230d341d3..982d5220ab52ce 100644
--- a/mlir/include/mlir/IR/StorageUniquerSupport.h
+++ b/mlir/include/mlir/IR/StorageUniquerSupport.h
@@ -175,11 +175,11 @@ class StorageUserBase : public BaseT, public Traits<ConcreteT>... {
   /// function is guaranteed to return a non null object and will assert if
   /// the arguments provided are invalid.
   template <typename... Args>
-  static ConcreteT get(MLIRContext *ctx, Args... args) {
+  static ConcreteT get(MLIRContext *ctx, Args &&...args) {
     // Ensure that the invariants are correct for construction.
     assert(
         succeeded(ConcreteT::verify(getDefaultDiagnosticEmitFn(ctx), args...)));
-    return UniquerT::template get<ConcreteT>(ctx, args...);
+    return UniquerT::template get<ConcreteT>(ctx, std::forward<Args>(args)...);
   }
 
   /// Get or create a new ConcreteT instance within the ctx, defined at
@@ -187,8 +187,9 @@ class StorageUserBase : public BaseT, public Traits<ConcreteT>... {
   /// invalid, errors are emitted using the provided location and a null object
   /// is returned.
   template <typename... Args>
-  static ConcreteT getChecked(const Location &loc, Args... args) {
-    return ConcreteT::getChecked(getDefaultDiagnosticEmitFn(loc), args...);
+  static ConcreteT getChecked(const Location &loc, Args &&...args) {
+    return ConcreteT::getChecked(getDefaultDiagnosticEmitFn(loc),
+                                 std::forward<Args>(args)...);
   }
 
   /// Get or create a new ConcreteT instance within the ctx. If the arguments
diff --git a/mlir/include/mlir/IR/SymbolTable.h b/mlir/include/mlir/IR/SymbolTable.h
index 33427788a075ed..7f21f22eba951e 100644
--- a/mlir/include/mlir/IR/SymbolTable.h
+++ b/mlir/include/mlir/IR/SymbolTable.h
@@ -55,6 +55,23 @@ class SymbolTable {
   /// after insertion as attribute.
   StringAttr insert(Operation *symbol, Block::iterator insertPt = {});
 
+  /// Renames the given op or the op refered to by the given name to the given
+  /// new name and updates the symbol table and all usages of the symbol
+  /// accordingly. Fails if the updating of the usages fails.
+  LogicalResult rename(StringAttr from, StringAttr to);
+  LogicalResult rename(Operation *op, StringAttr to);
+  LogicalResult rename(StringAttr from, StringRef to);
+  LogicalResult rename(Operation *op, StringRef to);
+
+  /// Renames the given op or the op refered to by the given name to the a name
+  /// that is unique within this and the provided other symbol tables and
+  /// updates the symbol table and all usages of the symbol accordingly. Returns
+  /// the new name or failure if the renaming fails.
+  FailureOr<StringAttr> renameToUnique(StringAttr from,
+                                       ArrayRef<SymbolTable *> others);
+  FailureOr<StringAttr> renameToUnique(Operation *op,
+                                       ArrayRef<SymbolTable *> others);
+
   /// Return the name of the attribute used for symbol names.
   static StringRef getSymbolAttrName() { return "sym_name"; }
 
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index 51d4e366e4970d..4e550fe3e3a60e 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -268,6 +268,9 @@ class OpOperand : public IROperand<OpOperand, Value> {
   /// Return which operand this is in the OpOperand list of the Operation.
   unsigned getOperandNumber();
 
+  /// Set the current value being used by this operand.
+  void assign(Value value) { set(value); }
+
 private:
   /// Keep the constructor private and accessible to the OperandStorage class
   /// only to avoid hard-to-debug typo/programming mistakes.
diff --git a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h
index 9c11178f9cd9ca..ed69e5824f70b5 100644
--- a/mlir/include/mlir/IR/ValueRange.h
+++ b/mlir/include/mlir/IR/ValueRange.h
@@ -126,6 +126,9 @@ class MutableOperandRange {
                       ArrayRef<OperandSegment> operandSegments = std::nullopt);
   MutableOperandRange(Operation *owner);
 
+  /// Construct a new mutable range for the given OpOperand.
+  MutableOperandRange(OpOperand &opOperand);
+
   /// Slice this range into a sub range, with the additional operand segment.
   MutableOperandRange
   slice(unsigned subStart, unsigned subLen,
diff --git a/mlir/include/mlir/Support/StorageUniquer.h b/mlir/include/mlir/Support/StorageUniquer.h
index 13359bf91f40d1..2369d9cf90ff9d 100644
--- a/mlir/include/mlir/Support/StorageUniquer.h
+++ b/mlir/include/mlir/Support/StorageUniquer.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
+#include <utility>
 
 namespace mlir {
 namespace detail {
@@ -207,7 +208,7 @@ class StorageUniquer {
 
     // Generate a constructor function for the derived storage.
     auto ctorFn = [&](StorageAllocator &allocator) {
-      auto *storage = Storage::construct(allocator, derivedKey);
+      auto *storage = Storage::construct(allocator, std::move(derivedKey));
       if (initFn)
         initFn(storage);
       return storage;
@@ -300,9 +301,9 @@ class StorageUniquer {
   static typename ImplTy::KeyTy getKey(Args &&...args) {
     if constexpr (llvm::is_detected<detail::has_impltype_getkey_t, ImplTy,
                                     Args...>::value)
-      return ImplTy::getKey(args...);
+      return ImplTy::getKey(std::forward<Args>(args)...);
     else
-      return typename ImplTy::KeyTy(args...);
+      return typename ImplTy::KeyTy(std::forward<Args>(args)...);
   }
 
   //===--------------------------------------------------------------------===//
diff --git a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
index d423d37b9770c6..bbe139fd4fb69f 100644
--- a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
@@ -407,10 +407,6 @@ void DeadCodeAnalysis::visitRegionTerminator(
 
 void DeadCodeAnalysis::visitCallableTerminator(Operation *op,
                                                CallableOpInterface callable) {
-  // If there are no exiting values, we have nothing to do.
-  if (op->getNumOperands() == 0)
-    return;
-
   // Add as predecessors to all callsites this return op.
   auto *callsites = getOrCreateFor<PredecessorState>(op, callable);
   bool canResolve = op->hasTrait<OpTrait::ReturnLike>();
diff --git a/mlir/lib/AsmParser/AttributeParser.cpp b/mlir/lib/AsmParser/AttributeParser.cpp
index 3437ac9addc5ff..b1991ce06f6eab 100644
--- a/mlir/lib/AsmParser/AttributeParser.cpp
+++ b/mlir/lib/AsmParser/AttributeParser.cpp
@@ -735,8 +735,7 @@ DenseElementsAttr TensorLiteralParser::getHexAttr(SMLoc loc, ShapedType type) {
     return nullptr;
   }
 
-  if (llvm::support::endian::system_endianness() ==
-      llvm::support::endianness::big) {
+  if (llvm::endianness::native == llvm::endianness::big) {
     // Convert endianess in big-endian(BE) machines. `rawData` is
     // little-endian(LE) because HEX in raw data of dense element attribute
     // is always LE format. It is converted into BE here to be used in BE
diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
index 3061e042c851d9..8e9e0b6baf76c2 100644
--- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
+++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
@@ -28,13 +28,13 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) {
       .value("singleton_nu", MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NU)
       .value("singleton_no", MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NO)
       .value("singleton_nu_no", MLIR_SPARSE_TENSOR_DIM_LEVEL_SINGLETON_NU_NO)
-      .value("compressed_hi", MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI)
-      .value("compressed_hi_nu",
-             MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI_NU)
-      .value("compressed_hi_no",
-             MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI_NO)
-      .value("compressed_hi_nu_no",
-             MLIR_SPARSE_TENSOR_DIM_LEVEL_COMPRESSED_WITH_HI_NU_NO);
+      .value("loose_compressed", MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED)
+      .value("loose_compressed_nu",
+             MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED_NU)
+      .value("loose_compressed_no",
+             MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED_NO)
+      .value("loose_compressed_nu_no",
+             MLIR_SPARSE_TENSOR_DIM_LEVEL_LOOSE_COMPRESSED_NU_NO);
 
   mlir_attribute_subclass(m, "EncodingAttr",
                           mlirAttributeIsASparseTensorEncodingAttr)
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index aad74f511e7ef1..c8373e06f0db77 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2207,9 +2207,9 @@ class PyBlockArgumentList
 };
 
 /// A list of operation operands. Internally, these are stored as consecutive
-/// elements, random access is cheap. The result list is associated with the
-/// operation whose results these are, and extends the lifetime of this
-/// operation.
+/// elements, random access is cheap. The (returned) operand list is associated
+/// with the operation whose operands these are, and thus extends the lifetime
+/// of this operation.
 class PyOpOperandList : public Sliceable<PyOpOperandList, PyValue> {
 public:
   static constexpr const char *pyClassName = "OpOperandList";
@@ -2262,9 +2262,9 @@ class PyOpOperandList : public Sliceable<PyOpOperandList, PyValue> {
 };
 
 /// A list of operation results. Internally, these are stored as consecutive
-/// elements, random access is cheap. The result list is associated with the
-/// operation whose results these are, and extends the lifetime of this
-/// operation.
+/// elements, random access is cheap. The (returned) result list is associated
+/// with the operation whose results these are, and thus extends the lifetime of
+/// this operation.
 class PyOpResultList : public Sliceable<PyOpResultList, PyOpResult> {
 public:
   static constexpr const char *pyClassName = "OpResultList";
@@ -2307,6 +2307,52 @@ class PyOpResultList : public Sliceable<PyOpResultList, PyOpResult> {
   PyOperationRef operation;
 };
 
+/// A list of operation successors. Internally, these are stored as consecutive
+/// elements, random access is cheap. The (returned) successor list is
+/// associated with the operation whose successors these are, and thus extends
+/// the lifetime of this operation.
+class PyOpSuccessors : public Sliceable<PyOpSuccessors, PyBlock> {
+public:
+  static constexpr const char *pyClassName = "OpSuccessors";
+
+  PyOpSuccessors(PyOperationRef operation, intptr_t startIndex = 0,
+                 intptr_t length = -1, intptr_t step = 1)
+      : Sliceable(startIndex,
+                  length == -1 ? mlirOperationGetNumSuccessors(operation->get())
+                               : length,
+                  step),
+        operation(operation) {}
+
+  void dunderSetItem(intptr_t index, PyBlock block) {
+    index = wrapIndex(index);
+    mlirOperationSetSuccessor(operation->get(), index, block.get());
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def("__setitem__", &PyOpSuccessors::dunderSetItem);
+  }
+
+private:
+  /// Give the parent CRTP class access to hook implementations below.
+  friend class Sliceable<PyOpSuccessors, PyBlock>;
+
+  intptr_t getRawNumElements() {
+    operation->checkValid();
+    return mlirOperationGetNumSuccessors(operation->get());
+  }
+
+  PyBlock getRawElement(intptr_t pos) {
+    MlirBlock block = mlirOperationGetSuccessor(operation->get(), pos);
+    return PyBlock(operation, block);
+  }
+
+  PyOpSuccessors slice(intptr_t startIndex, intptr_t length, intptr_t step) {
+    return PyOpSuccessors(operation, startIndex, length, step);
+  }
+
+  PyOperationRef operation;
+};
+
 /// A list of operation attributes. Can be indexed by name, producing
 /// attributes, or by index, producing named attributes.
 class PyOpAttributeMap {
@@ -2924,16 +2970,28 @@ void mlir::python::populateIRCore(py::module &m) {
                              &PyOperation::getCapsule)
       .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyOperation::createFromCapsule)
       .def_property_readonly("operation", [](py::object self) { return self; })
-      .def_property_readonly("opview", &PyOperation::createOpView);
+      .def_property_readonly("opview", &PyOperation::createOpView)
+      .def_property_readonly(
+          "successors",
+          [](PyOperationBase &self) {
+            return PyOpSuccessors(self.getOperation().getRef());
+          },
+          "Returns the list of Operation successors.");
 
   auto opViewClass =
       py::class_<PyOpView, PyOperationBase>(m, "OpView", py::module_local())
           .def(py::init<py::object>(), py::arg("operation"))
           .def_property_readonly("operation", &PyOpView::getOperationObject)
           .def_property_readonly("opview", [](py::object self) { return self; })
-          .def("__str__", [](PyOpView &self) {
-            return py::str(self.getOperationObject());
-          });
+          .def(
+              "__str__",
+              [](PyOpView &self) { return py::str(self.getOperationObject()); })
+          .def_property_readonly(
+              "successors",
+              [](PyOperationBase &self) {
+                return PyOpSuccessors(self.getOperation().getRef());
+              },
+              "Returns the list of Operation successors.");
   opViewClass.attr("_ODS_REGIONS") = py::make_tuple(0, true);
   opViewClass.attr("_ODS_OPERAND_SEGMENTS") = py::none();
   opViewClass.attr("_ODS_RESULT_SEGMENTS") = py::none();
@@ -3430,35 +3488,32 @@ void mlir::python::populateIRCore(py::module &m) {
           kValueDunderStrDocstring)
       .def(
           "get_name",
-          [](PyValue &self, std::optional<bool> useLocalScope,
-             std::optional<std::reference_wrapper<PyAsmState>> state) {
+          [](PyValue &self, bool useLocalScope) {
             PyPrintAccumulator printAccum;
-            MlirOpPrintingFlags flags;
-            MlirAsmState valueState;
-            // Use state if provided, else create a new state.
-            if (state) {
-              valueState = state.value().get().get();
-              // Don't allow setting using local scope and state at same time.
-              if (useLocalScope.has_value())
-                throw py::value_error(
-                    "setting AsmState and local scope together not supported");
-            } else {
-              flags = mlirOpPrintingFlagsCreate();
-              if (useLocalScope.value_or(false))
-                mlirOpPrintingFlagsUseLocalScope(flags);
-              valueState = mlirAsmStateCreateForValue(self.get(), flags);
-            }
-            mlirValuePrintAsOperand(self.get(), valueState, printAccum.getCallback(),
+            MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate();
+            if (useLocalScope)
+              mlirOpPrintingFlagsUseLocalScope(flags);
+            MlirAsmState valueState =
+                mlirAsmStateCreateForValue(self.get(), flags);
+            mlirValuePrintAsOperand(self.get(), valueState,
+                                    printAccum.getCallback(),
+                                    printAccum.getUserData());
+            mlirOpPrintingFlagsDestroy(flags);
+            mlirAsmStateDestroy(valueState);
+            return printAccum.join();
+          },
+          py::arg("use_local_scope") = false)
+      .def(
+          "get_name",
+          [](PyValue &self, std::reference_wrapper<PyAsmState> state) {
+            PyPrintAccumulator printAccum;
+            MlirAsmState valueState = state.get().get();
+            mlirValuePrintAsOperand(self.get(), valueState,
+                                    printAccum.getCallback(),
                                     printAccum.getUserData());
-            // Release state if allocated locally.
-            if (!state) {
-              mlirOpPrintingFlagsDestroy(flags);
-              mlirAsmStateDestroy(valueState);
-            }
             return printAccum.join();
           },
-          py::arg("use_local_scope") = std::nullopt,
-          py::arg("state") = std::nullopt, kGetNameAsOperand)
+          py::arg("state"), kGetNameAsOperand)
       .def_property_readonly(
           "type", [](PyValue &self) { return mlirValueGetType(self.get()); })
       .def(
@@ -3523,6 +3578,7 @@ void mlir::python::populateIRCore(py::module &m) {
   PyOpOperandIterator::bind(m);
   PyOpOperandList::bind(m);
   PyOpResultList::bind(m);
+  PyOpSuccessors::bind(m);
   PyRegionIterator::bind(m);
   PyRegionList::bind(m);
 
diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp
index 65b2b7466fb7c3..c1abbbe364611a 100644
--- a/mlir/lib/CAPI/IR/IR.cpp
+++ b/mlir/lib/CAPI/IR/IR.cpp
@@ -637,6 +637,11 @@ bool mlirOperationRemoveDiscardableAttributeByName(MlirOperation op,
   return !!unwrap(op)->removeDiscardableAttr(unwrap(name));
 }
 
+void mlirOperationSetSuccessor(MlirOperation op, intptr_t pos,
+                               MlirBlock block) {
+  unwrap(op)->setSuccessor(unwrap(block), static_cast<unsigned>(pos));
+}
+
 intptr_t mlirOperationGetNumAttributes(MlirOperation op) {
   return static_cast<intptr_t>(unwrap(op)->getAttrs().size());
 }
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 7de7f3cb9e36b0..d52f01880282e1 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -33,6 +33,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/SymbolTable.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Support/MathExtras.h"
@@ -48,6 +49,7 @@
 #include "llvm/Support/FormatVariadic.h"
 #include <algorithm>
 #include <functional>
+#include <optional>
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTFUNCTOLLVMPASS
@@ -601,19 +603,38 @@ struct CallOpInterfaceLowering : public ConvertOpToLLVMPattern<CallOpType> {
   }
 };
 
-struct CallOpLowering : public CallOpInterfaceLowering<func::CallOp> {
-  using Super::Super;
+class CallOpLowering : public CallOpInterfaceLowering<func::CallOp> {
+public:
+  CallOpLowering(const LLVMTypeConverter &typeConverter,
+                 // Can be nullptr.
+                 const SymbolTable *symbolTable, PatternBenefit benefit = 1)
+      : CallOpInterfaceLowering<func::CallOp>(typeConverter, benefit),
+        symbolTable(symbolTable) {}
 
   LogicalResult
   matchAndRewrite(func::CallOp callOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     bool useBarePtrCallConv = false;
-    if (Operation *callee = SymbolTable::lookupNearestSymbolFrom(
-            callOp, callOp.getCalleeAttr())) {
-      useBarePtrCallConv = shouldUseBarePtrCallConv(callee, getTypeConverter());
+    if (getTypeConverter()->getOptions().useBarePtrCallConv) {
+      useBarePtrCallConv = true;
+    } else if (symbolTable != nullptr) {
+      // Fast lookup.
+      Operation *callee =
+          symbolTable->lookup(callOp.getCalleeAttr().getValue());
+      useBarePtrCallConv =
+          callee != nullptr && callee->hasAttr(barePtrAttrName);
+    } else {
+      // Warning: This is a linear lookup.
+      Operation *callee =
+          SymbolTable::lookupNearestSymbolFrom(callOp, callOp.getCalleeAttr());
+      useBarePtrCallConv =
+          callee != nullptr && callee->hasAttr(barePtrAttrName);
     }
     return matchAndRewriteImpl(callOp, adaptor, rewriter, useBarePtrCallConv);
   }
+
+private:
+  const SymbolTable *symbolTable = nullptr;
 };
 
 struct CallIndirectOpLowering
@@ -728,16 +749,14 @@ void mlir::populateFuncToLLVMFuncOpConversionPattern(
   patterns.add<FuncOpConversion>(converter);
 }
 
-void mlir::populateFuncToLLVMConversionPatterns(LLVMTypeConverter &converter,
-                                                RewritePatternSet &patterns) {
+void mlir::populateFuncToLLVMConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    const SymbolTable *symbolTable) {
   populateFuncToLLVMFuncOpConversionPattern(converter, patterns);
-  // clang-format off
-  patterns.add<
-      CallIndirectOpLowering,
-      CallOpLowering,
-      ConstantOpLowering,
-      ReturnOpLowering>(converter);
-  // clang-format on
+  patterns.add<CallIndirectOpLowering>(converter);
+  patterns.add<CallOpLowering>(converter, symbolTable);
+  patterns.add<ConstantOpLowering>(converter);
+  patterns.add<ReturnOpLowering>(converter);
 }
 
 namespace {
@@ -776,8 +795,15 @@ struct ConvertFuncToLLVMPass
     LLVMTypeConverter typeConverter(&getContext(), options,
                                     &dataLayoutAnalysis);
 
+    std::optional<SymbolTable> optSymbolTable = std::nullopt;
+    const SymbolTable *symbolTable = nullptr;
+    if (!options.useBarePtrCallConv) {
+      optSymbolTable.emplace(m);
+      symbolTable = &optSymbolTable.value();
+    }
+
     RewritePatternSet patterns(&getContext());
-    populateFuncToLLVMConversionPatterns(typeConverter, patterns);
+    populateFuncToLLVMConversionPatterns(typeConverter, patterns, symbolTable);
 
     // TODO: Remove these in favor of their dedicated conversion passes.
     arith::populateArithToLLVMConversionPatterns(typeConverter, patterns);
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/CMakeLists.txt b/mlir/lib/Conversion/NVGPUToNVVM/CMakeLists.txt
index 00e775ce7dd22b..a050749eb7da87 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/NVGPUToNVVM/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_conversion_library(MLIRNVGPUToNVVM
   MLIRLLVMDialect
   MLIRNVGPUDialect
   MLIRNVVMDialect
+  MLIRArithDialect
   MLIRPass
   MLIRSCFTransforms
   MLIRTransforms
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 4d1f6641af6dca..99c4d422335135 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
@@ -19,6 +20,7 @@
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Value.h"
@@ -38,19 +40,18 @@ namespace mlir {
 
 using namespace mlir;
 
-/// Number of bits that needs to excluded when building matrix descriptor for
+/// Number of bits that needs to be excluded when building matrix descriptor for
 /// wgmma operations.
 constexpr int exclude4LSB = 4;
 
 /// GPU has 32 bit registers, this function truncates values when larger width
 /// is not needed.
-static Value truncToI32(ConversionPatternRewriter &rewriter, Location loc,
-                        Value value) {
+static Value truncToI32(ImplicitLocOpBuilder &b, Value value) {
   Type type = value.getType();
   assert(llvm::isa<IntegerType>(type) && "expected an integer Value");
   if (type.getIntOrFloatBitWidth() <= 32)
     return value;
-  return rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), value);
+  return b.create<LLVM::TruncOp>(b.getI32Type(), value);
 }
 
 /// Returns the type for the intrinsic given the vectorResultType of the
@@ -170,22 +171,21 @@ static Value convertIntrinsicResult(Location loc, Type intrinsicResultType,
 /// `nvvm.mma.sync` op expects these argments to be a given in a long list of
 /// scalars of certain types. This function helps unpack the `vector` arguments
 /// and cast them to the types expected by `nvvm.mma.sync`.
-static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter,
-                                              Location loc, Value operand,
+static SmallVector<Value> unpackOperandVector(ImplicitLocOpBuilder &b,
+                                              Value operand,
                                               NVVM::MMATypes operandPtxType) {
   SmallVector<Value> result;
-  Type i32Ty = rewriter.getI32Type();
-  Type f64Ty = rewriter.getF64Type();
-  Type f32Ty = rewriter.getF32Type();
-  Type i8Ty = rewriter.getI8Type();
-  Type i4Ty = rewriter.getIntegerType(4);
-  Type i8x4Ty = LLVM::getFixedVectorType(i8Ty, 4);
-  Type i4x8Ty = LLVM::getFixedVectorType(i4Ty, 8);
+  Type i32Ty = b.getI32Type();
+  Type f64Ty = b.getF64Type();
+  Type f32Ty = b.getF32Type();
+  Type i64Ty = b.getI64Type();
+  Type i8x4Ty = LLVM::getFixedVectorType(b.getI8Type(), 4);
+  Type i4x8Ty = LLVM::getFixedVectorType(b.getIntegerType(4), 8);
   Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);
   auto arrayTy = cast<LLVM::LLVMArrayType>(operand.getType());
 
   for (unsigned i = 0, e = arrayTy.getNumElements(); i < e; ++i) {
-    Value toUse = rewriter.create<LLVM::ExtractValueOp>(loc, operand, i);
+    Value toUse = b.create<LLVM::ExtractValueOp>(operand, i);
 
     // For 4xi8 vectors, the intrinsic expects these to be provided as i32
     // scalar types.
@@ -193,8 +193,7 @@ static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter,
         arrayTy.getElementType() == i4x8Ty ||
         (arrayTy.getElementType() == f32x1Ty &&
          operandPtxType == NVVM::MMATypes::tf32)) {
-      result.push_back(
-          rewriter.create<LLVM::BitcastOp>(loc, rewriter.getI32Type(), toUse));
+      result.push_back(b.create<LLVM::BitcastOp>(i32Ty, toUse));
       continue;
     }
 
@@ -207,10 +206,9 @@ static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter,
                          innerArrayTy.getElementType() == f32Ty)) {
       for (unsigned idx = 0, innerSize = innerArrayTy.getNumElements();
            idx < innerSize; idx++) {
-        result.push_back(rewriter.create<LLVM::ExtractElementOp>(
-            loc, toUse,
-            rewriter.create<LLVM::ConstantOp>(
-                loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(idx))));
+        result.push_back(b.create<LLVM::ExtractElementOp>(
+            toUse,
+            b.create<LLVM::ConstantOp>(i64Ty, b.getI64IntegerAttr(idx))));
       }
       continue;
     }
@@ -256,7 +254,7 @@ struct MmaLdMatrixOpToNVVM : public ConvertOpToLLVMPattern<nvgpu::LdMatrixOp> {
   matchAndRewrite(nvgpu::LdMatrixOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = getContext();
-    Location loc = op->getLoc();
+    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
     // The result type of ldmatrix will always be a struct of 32bit integer
     // registers if more than one 32bit value is returned. Otherwise, the result
@@ -283,10 +281,10 @@ struct MmaLdMatrixOpToNVVM : public ConvertOpToLLVMPattern<nvgpu::LdMatrixOp> {
 
     auto srcMemrefType = cast<MemRefType>(op.getSrcMemref().getType());
     Value srcPtr =
-        getStridedElementPtr(loc, srcMemrefType, adaptor.getSrcMemref(),
+        getStridedElementPtr(b.getLoc(), srcMemrefType, adaptor.getSrcMemref(),
                              adaptor.getIndices(), rewriter);
-    Value ldMatrixResult = rewriter.create<NVVM::LdMatrixOp>(
-        loc, ldMatrixResultType, srcPtr,
+    Value ldMatrixResult = b.create<NVVM::LdMatrixOp>(
+        ldMatrixResultType, srcPtr,
         /*num=*/op.getNumTiles(),
         /*layout=*/op.getTranspose() ? NVVM::MMALayout::col
                                      : NVVM::MMALayout::row);
@@ -296,15 +294,13 @@ struct MmaLdMatrixOpToNVVM : public ConvertOpToLLVMPattern<nvgpu::LdMatrixOp> {
     // actual vector type (still of width 32b) and repack them into a result
     // struct.
     Type finalResultType = typeConverter->convertType(vectorResultType);
-    Value result = rewriter.create<LLVM::UndefOp>(loc, finalResultType);
+    Value result = b.create<LLVM::UndefOp>(finalResultType);
     for (int64_t i = 0, e = vectorResultType.getDimSize(0); i < e; i++) {
       Value i32Register =
-          num32BitRegs > 1
-              ? rewriter.create<LLVM::ExtractValueOp>(loc, ldMatrixResult, i)
-              : ldMatrixResult;
-      Value casted =
-          rewriter.create<LLVM::BitcastOp>(loc, innerVectorType, i32Register);
-      result = rewriter.create<LLVM::InsertValueOp>(loc, result, casted, i);
+          num32BitRegs > 1 ? b.create<LLVM::ExtractValueOp>(ldMatrixResult, i)
+                           : ldMatrixResult;
+      Value casted = b.create<LLVM::BitcastOp>(innerVectorType, i32Register);
+      result = b.create<LLVM::InsertValueOp>(result, casted, i);
     }
 
     rewriter.replaceOp(op, result);
@@ -335,7 +331,7 @@ struct MmaSyncOptoNVVM : public ConvertOpToLLVMPattern<nvgpu::MmaSyncOp> {
   LogicalResult
   matchAndRewrite(nvgpu::MmaSyncOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Location loc = op->getLoc();
+    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     // Get the shapes of the MMAMatrix type being used. The shapes will
     // choose which intrinsic this op will be lowered to.
     VectorType aType = op.getMatrixA().getType();
@@ -368,17 +364,17 @@ struct MmaSyncOptoNVVM : public ConvertOpToLLVMPattern<nvgpu::MmaSyncOp> {
       overflow = NVVM::MMAIntOverflow::satfinite;
 
     SmallVector<Value> matA =
-        unpackOperandVector(rewriter, loc, adaptor.getMatrixA(), *ptxTypeA);
+        unpackOperandVector(b, adaptor.getMatrixA(), *ptxTypeA);
     SmallVector<Value> matB =
-        unpackOperandVector(rewriter, loc, adaptor.getMatrixB(), *ptxTypeB);
+        unpackOperandVector(b, adaptor.getMatrixB(), *ptxTypeB);
     SmallVector<Value> matC =
-        unpackOperandVector(rewriter, loc, adaptor.getMatrixC(), *ptxTypeC);
+        unpackOperandVector(b, adaptor.getMatrixC(), *ptxTypeC);
 
     Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);
     Type intrinsicResTy = inferIntrinsicResultType(
         typeConverter->convertType(op->getResultTypes()[0]));
-    Value intrinsicResult = rewriter.create<NVVM::MmaOp>(
-        op.getLoc(), intrinsicResTy, matA, matB, matC,
+    Value intrinsicResult = b.create<NVVM::MmaOp>(
+        intrinsicResTy, matA, matB, matC,
         /*shape=*/gemmShape,
         /*b1Op=*/std::nullopt,
         /*intOverflow=*/overflow,
@@ -399,8 +395,8 @@ struct ConvertNVGPUToNVVMPass
   using Base::Base;
 
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry
-        .insert<memref::MemRefDialect, LLVM::LLVMDialect, NVVM::NVVMDialect>();
+    registry.insert<memref::MemRefDialect, LLVM::LLVMDialect, NVVM::NVVMDialect,
+                    arith::ArithDialect>();
   }
 
   void runOnOperation() override {
@@ -441,6 +437,7 @@ struct ConvertNVGPUToNVVMPass
     populateNVGPUToNVVMConversionPatterns(converter, patterns);
     LLVMConversionTarget target(getContext());
     target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
+    target.addLegalDialect<::mlir::arith::ArithDialect>();
     target.addLegalDialect<::mlir::memref::MemRefDialect>();
     target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
     mlir::scf::populateSCFStructuralTypeConversionsAndLegality(
@@ -511,14 +508,14 @@ static std::string buildMmaSparseAsmString(
 /// Builds an inline assembly operation corresponding to the specified MMA
 /// sparse sync operation.
 static FailureOr<LLVM::InlineAsmOp> emitMmaSparseSyncOpAsm(
-    Location loc, NVVM::MMATypes ptxTypeA, NVVM::MMATypes ptxTypeB,
+    ImplicitLocOpBuilder &b, NVVM::MMATypes ptxTypeA, NVVM::MMATypes ptxTypeB,
     NVVM::MMATypes ptxTypeC, NVVM::MMATypes ptxTypeD,
     std::optional<NVVM::MMAIntOverflow> overflow, ArrayRef<Value> unpackedAData,
     ArrayRef<Value> unpackedB, ArrayRef<Value> unpackedC, Value indexData,
     int64_t metadataSelector, const std::array<int64_t, 3> &shape,
-    Type intrinsicResultType, ConversionPatternRewriter &rewriter) {
-  auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
-                                                  LLVM::AsmDialect::AD_ATT);
+    Type intrinsicResultType) {
+  auto asmDialectAttr =
+      LLVM::AsmDialectAttr::get(b.getContext(), LLVM::AsmDialect::AD_ATT);
 
   const unsigned matASize = unpackedAData.size();
   const unsigned matBSize = unpackedB.size();
@@ -536,15 +533,15 @@ static FailureOr<LLVM::InlineAsmOp> emitMmaSparseSyncOpAsm(
     llvm::append_range(asmVals, args);
   asmVals.push_back(indexData);
 
-  return rewriter.create<LLVM::InlineAsmOp>(loc,
-                                            /*resultTypes=*/intrinsicResultType,
-                                            /*operands=*/asmVals,
-                                            /*asm_string=*/asmStr,
-                                            /*constraints=*/constraintStr,
-                                            /*has_side_effects=*/true,
-                                            /*is_align_stack=*/false,
-                                            /*asm_dialect=*/asmDialectAttr,
-                                            /*operand_attrs=*/ArrayAttr());
+  return b.create<LLVM::InlineAsmOp>(
+      /*resultTypes=*/intrinsicResultType,
+      /*operands=*/asmVals,
+      /*asm_string=*/asmStr,
+      /*constraints=*/constraintStr,
+      /*has_side_effects=*/true,
+      /*is_align_stack=*/false,
+      /*asm_dialect=*/asmDialectAttr,
+      /*operand_attrs=*/ArrayAttr());
 }
 
 /// Lowers `nvgpu.mma.sp.sync` to inline assembly.
@@ -555,7 +552,7 @@ struct NVGPUMmaSparseSyncLowering
   LogicalResult
   matchAndRewrite(nvgpu::MmaSparseSyncOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Location loc = op->getLoc();
+    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     // Get the shapes of the MMAMatrix type being used. The shapes will
     // choose which intrinsic this op will be lowered to.
     VectorType aType = op.getMatrixA().getType();
@@ -586,11 +583,11 @@ struct NVGPUMmaSparseSyncLowering
       overflow = NVVM::MMAIntOverflow::satfinite;
 
     SmallVector<Value> matA =
-        unpackOperandVector(rewriter, loc, adaptor.getMatrixA(), *ptxTypeA);
+        unpackOperandVector(b, adaptor.getMatrixA(), *ptxTypeA);
     SmallVector<Value> matB =
-        unpackOperandVector(rewriter, loc, adaptor.getMatrixB(), *ptxTypeB);
+        unpackOperandVector(b, adaptor.getMatrixB(), *ptxTypeB);
     SmallVector<Value> matC =
-        unpackOperandVector(rewriter, loc, adaptor.getMatrixC(), *ptxTypeC);
+        unpackOperandVector(b, adaptor.getMatrixC(), *ptxTypeC);
 
     Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);
     Type intrinsicResTy = inferIntrinsicResultType(
@@ -602,13 +599,13 @@ struct NVGPUMmaSparseSyncLowering
         LLVM::getFixedVectorType(rewriter.getI16Type(), 2))
       return op->emitOpError() << "Expected metadata type to be LLVM "
                                   "VectorType of 2 i16 elements";
-    sparseMetadata = rewriter.create<LLVM::BitcastOp>(
-        loc, rewriter.getI32Type(), sparseMetadata);
+    sparseMetadata =
+        b.create<LLVM::BitcastOp>(rewriter.getI32Type(), sparseMetadata);
 
     FailureOr<LLVM::InlineAsmOp> intrinsicResult = emitMmaSparseSyncOpAsm(
-        loc, *ptxTypeA, *ptxTypeB, *ptxTypeC, *ptxTypeC, overflow, matA, matB,
+        b, *ptxTypeA, *ptxTypeB, *ptxTypeC, *ptxTypeC, overflow, matA, matB,
         matC, sparseMetadata, op.getSparsitySelector(), op.getMmaShapeAsArray(),
-        intrinsicResTy, rewriter);
+        intrinsicResTy);
     if (failed(intrinsicResult))
       return failure();
 
@@ -629,10 +626,12 @@ struct NVGPUAsyncCopyLowering
   LogicalResult
   matchAndRewrite(nvgpu::DeviceAsyncCopyOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Location loc = op->getLoc();
+    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+    Location loc = op.getLoc();
     auto dstMemrefType = cast<MemRefType>(op.getDst().getType());
-    Value dstPtr = getStridedElementPtr(loc, dstMemrefType, adaptor.getDst(),
-                                        adaptor.getDstIndices(), rewriter);
+    Value dstPtr =
+        getStridedElementPtr(b.getLoc(), dstMemrefType, adaptor.getDst(),
+                             adaptor.getDstIndices(), rewriter);
     auto i8Ty = IntegerType::get(op.getContext(), 8);
     FailureOr<unsigned> dstAddressSpace =
         getTypeConverter()->getMemRefAddressSpace(dstMemrefType);
@@ -642,7 +641,7 @@ struct NVGPUAsyncCopyLowering
     auto dstPointerType =
         getTypeConverter()->getPointerType(i8Ty, *dstAddressSpace);
     if (!getTypeConverter()->useOpaquePointers())
-      dstPtr = rewriter.create<LLVM::BitcastOp>(loc, dstPointerType, dstPtr);
+      dstPtr = b.create<LLVM::BitcastOp>(dstPointerType, dstPtr);
 
     auto srcMemrefType = cast<MemRefType>(op.getSrc().getType());
     FailureOr<unsigned> srcAddressSpace =
@@ -656,12 +655,11 @@ struct NVGPUAsyncCopyLowering
     auto srcPointerType =
         getTypeConverter()->getPointerType(i8Ty, *srcAddressSpace);
     if (!getTypeConverter()->useOpaquePointers())
-      scrPtr = rewriter.create<LLVM::BitcastOp>(loc, srcPointerType, scrPtr);
+      scrPtr = b.create<LLVM::BitcastOp>(srcPointerType, scrPtr);
     // Intrinsics takes a global pointer so we need an address space cast.
     auto srcPointerGlobalType = getTypeConverter()->getPointerType(
         i8Ty, NVVM::NVVMMemorySpace::kGlobalMemorySpace);
-    scrPtr = rewriter.create<LLVM::AddrSpaceCastOp>(loc, srcPointerGlobalType,
-                                                    scrPtr);
+    scrPtr = b.create<LLVM::AddrSpaceCastOp>(srcPointerGlobalType, scrPtr);
     int64_t dstElements = adaptor.getDstElements().getZExtValue();
     int64_t sizeInBytes =
         (dstMemrefType.getElementTypeBitWidth() * dstElements) / 8;
@@ -675,16 +673,14 @@ struct NVGPUAsyncCopyLowering
       // memory) of CpAsyncOp is read only for SrcElements number of elements.
       // The rest of the DstElements in the destination (shared memory) are
       // filled with zeros.
-      Value c3I32 = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(3));
-      Value bitwidth = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(),
-          rewriter.getI32IntegerAttr(srcMemrefType.getElementTypeBitWidth()));
-      Value srcElementsI32 =
-          rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), srcBytes);
-      srcBytes = rewriter.create<LLVM::LShrOp>(
-          loc, rewriter.create<LLVM::MulOp>(loc, bitwidth, srcElementsI32),
-          c3I32);
+      Value c3I32 =
+          b.create<LLVM::ConstantOp>(b.getI32Type(), b.getI32IntegerAttr(3));
+      Value bitwidth = b.create<LLVM::ConstantOp>(
+          b.getI32Type(),
+          b.getI32IntegerAttr(srcMemrefType.getElementTypeBitWidth()));
+      Value srcElementsI32 = b.create<LLVM::TruncOp>(b.getI32Type(), srcBytes);
+      srcBytes = b.create<LLVM::LShrOp>(
+          b.create<LLVM::MulOp>(bitwidth, srcElementsI32), c3I32);
     }
     // Cache global (.cg) for 16 dst bytes, Cache all (.ca) for sizes other than
     // 16 dst bytes.
@@ -693,15 +689,14 @@ struct NVGPUAsyncCopyLowering
             ? NVVM::LoadCacheModifierKind::CG
             : NVVM::LoadCacheModifierKind::CA;
 
-    rewriter.create<NVVM::CpAsyncOp>(
-        loc, dstPtr, scrPtr, rewriter.getI32IntegerAttr(sizeInBytes),
+    b.create<NVVM::CpAsyncOp>(
+        dstPtr, scrPtr, rewriter.getI32IntegerAttr(sizeInBytes),
         NVVM::LoadCacheModifierKindAttr::get(op->getContext(), cacheModifier),
         srcBytes);
 
     // Drop the result token.
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        op->getLoc(), IntegerType::get(op.getContext(), 32),
-        rewriter.getI32IntegerAttr(0));
+    Value zero = b.create<LLVM::ConstantOp>(
+        IntegerType::get(op.getContext(), 32), rewriter.getI32IntegerAttr(0));
     rewriter.replaceOp(op, zero);
     return success();
   }
@@ -790,14 +785,14 @@ struct MBarrierBasePattern : public ConvertOpToLLVMPattern<SourceOp> {
 public:
   using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
   /// Returns the base pointer of the mbarrier object.
-  Value getMbarrierPtr(Operation *op, nvgpu::MBarrierGroupType mbarType,
-                       Value memrefDesc, Value mbarId,
+  Value getMbarrierPtr(ImplicitLocOpBuilder &b,
+                       nvgpu::MBarrierGroupType mbarType, Value memrefDesc,
+                       Value mbarId,
                        ConversionPatternRewriter &rewriter) const {
     MemRefType mbarrierMemrefType =
         nvgpu::getMBarrierMemrefType(rewriter.getContext(), mbarType);
     return ConvertToLLVMPattern::getStridedElementPtr(
-        op->getLoc(), mbarrierMemrefType, memrefDesc, {mbarId}, rewriter);
-    return memrefDesc;
+        b.getLoc(), mbarrierMemrefType, memrefDesc, {mbarId}, rewriter);
   }
 };
 
@@ -809,11 +804,12 @@ struct NVGPUMBarrierInitLowering
   LogicalResult
   matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     nvgpu::MBarrierGroupType mbarrierType = op.getBarriers().getType();
     rewriter.setInsertionPoint(op);
-    Value barrier = getMbarrierPtr(op, mbarrierType, adaptor.getBarriers(),
+    Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(),
                                    adaptor.getMbarId(), rewriter);
-    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
+    Value count = truncToI32(b, adaptor.getCount());
     if (isMbarrierShared(mbarrierType)) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
                                                               count);
@@ -831,8 +827,9 @@ struct NVGPUMBarrierArriveLowering
   LogicalResult
   matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     Value barrier =
-        getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
                        adaptor.getMbarId(), rewriter);
     Type tokenType = getTypeConverter()->convertType(
         nvgpu::MBarrierTokenType::get(op->getContext()));
@@ -856,12 +853,13 @@ struct NVGPUMBarrierArriveNoCompleteLowering
   LogicalResult
   matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     Value barrier =
-        getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
                        adaptor.getMbarId(), rewriter);
     Type tokenType = getTypeConverter()->convertType(
         nvgpu::MBarrierTokenType::get(op->getContext()));
-    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
+    Value count = truncToI32(b, adaptor.getCount());
     if (isMbarrierShared(op.getBarriers().getType())) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
           op, tokenType, barrier, count);
@@ -880,8 +878,9 @@ struct NVGPUMBarrierTestWaitLowering
   LogicalResult
   matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     Value barrier =
-        getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
                        adaptor.getMbarId(), rewriter);
     Type retType = rewriter.getI1Type();
     if (isMbarrierShared(op.getBarriers().getType())) {
@@ -902,10 +901,11 @@ struct NVGPUMBarrierArriveExpectTxLowering
   LogicalResult
   matchAndRewrite(nvgpu::MBarrierArriveExpectTxOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     Value barrier =
-        getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
                        adaptor.getMbarId(), rewriter);
-    Value txcount = truncToI32(rewriter, op->getLoc(), adaptor.getTxcount());
+    Value txcount = truncToI32(b, adaptor.getTxcount());
 
     if (isMbarrierShared(op.getBarriers().getType())) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxSharedOp>(
@@ -926,11 +926,12 @@ struct NVGPUMBarrierTryWaitParityLowering
   LogicalResult
   matchAndRewrite(nvgpu::MBarrierTryWaitParityOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     Value barrier =
-        getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
                        adaptor.getMbarId(), rewriter);
-    Value ticks = truncToI32(rewriter, op->getLoc(), adaptor.getTicks());
-    Value phase = truncToI32(rewriter, op->getLoc(), adaptor.getPhase());
+    Value ticks = truncToI32(b, adaptor.getTicks());
+    Value phase = truncToI32(b, adaptor.getPhase());
 
     if (isMbarrierShared(op.getBarriers().getType())) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierTryWaitParitySharedOp>(
@@ -950,16 +951,17 @@ struct NVGPUTmaAsyncLoadOpLowering
   LogicalResult
   matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     auto srcMemrefType = cast<MemRefType>(op.getDst().getType());
     Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType,
                                       adaptor.getDst(), {}, rewriter);
     Value barrier =
-        getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+        getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(),
                        adaptor.getMbarId(), rewriter);
 
     SmallVector<Value> coords = adaptor.getCoordinates();
     for (auto [index, value] : llvm::enumerate(coords)) {
-      coords[index] = truncToI32(rewriter, op->getLoc(), value);
+      coords[index] = truncToI32(b, value);
     }
 
     rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
@@ -967,16 +969,16 @@ struct NVGPUTmaAsyncLoadOpLowering
     return success();
   }
 };
-struct NVGPUGenerateGmmaDescriptorLowering
-    : public ConvertOpToLLVMPattern<nvgpu::GenerateGmmaDescriptorOp> {
+struct NVGPUGenerateWarpgroupDescriptorLowering
+    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupGenerateDescriptorOp> {
   using ConvertOpToLLVMPattern<
-      nvgpu::GenerateGmmaDescriptorOp>::ConvertOpToLLVMPattern;
+      nvgpu::WarpgroupGenerateDescriptorOp>::ConvertOpToLLVMPattern;
 
   LogicalResult
-  matchAndRewrite(nvgpu::GenerateGmmaDescriptorOp op, OpAdaptor adaptor,
+  matchAndRewrite(nvgpu::WarpgroupGenerateDescriptorOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
 
-    Location loc = op->getLoc();
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
 
     nvgpu::TensorMapSwizzleKind swizzleKind =
         op.getTensorMap().getType().getSwizzle();
@@ -992,20 +994,18 @@ struct NVGPUGenerateGmmaDescriptorLowering
         : (swizzleKind == nvgpu::TensorMapSwizzleKind::SWIZZLE_32B) ? 3
                                                                     : 0;
 
-    auto ti64 = rewriter.getIntegerType(64);
+    auto ti64 = b.getIntegerType(64);
     auto makeConst = [&](uint64_t index) -> Value {
-      return rewriter.create<LLVM::ConstantOp>(
-          loc, ti64, rewriter.getI64IntegerAttr(index));
+      return b.create<LLVM::ConstantOp>(ti64, b.getI64IntegerAttr(index));
     };
     auto shiftLeft = [&](Value value, unsigned shift) -> Value {
-      return rewriter.create<LLVM::ShlOp>(loc, ti64, value, makeConst(shift));
+      return b.create<LLVM::ShlOp>(ti64, value, makeConst(shift));
     };
     auto shiftRight = [&](Value value, unsigned shift) -> Value {
-      return rewriter.create<LLVM::LShrOp>(loc, ti64, value, makeConst(shift));
+      return b.create<LLVM::LShrOp>(ti64, value, makeConst(shift));
     };
     auto insertBit = [&](Value desc, Value val, int startBit) {
-      return rewriter.create<LLVM::OrOp>(loc, ti64, desc,
-                                         shiftLeft(val, startBit));
+      return b.create<LLVM::OrOp>(ti64, desc, shiftLeft(val, startBit));
     };
 
     int64_t sizeN = op.getTensorMap().getType().getTensor().getDimSize(0);
@@ -1019,7 +1019,7 @@ struct NVGPUGenerateGmmaDescriptorLowering
     Value baseAddr = getStridedElementPtr(
         op->getLoc(), cast<MemRefType>(op.getTensor().getType()),
         adaptor.getTensor(), {}, rewriter);
-    Value basePtr = rewriter.create<LLVM::PtrToIntOp>(loc, ti64, baseAddr);
+    Value basePtr = b.create<LLVM::PtrToIntOp>(ti64, baseAddr);
     // Just use 14 bits for base address
     Value basePtr14bit = shiftRight(shiftLeft(basePtr, 46), 50);
 
@@ -1037,7 +1037,7 @@ struct NVGPUGenerateGmmaDescriptorLowering
     // // [0,14)   start_address
     dsc = insertBit(dsc, basePtr14bit, startBaseAddrBit);
 
-    LLVM_DEBUG(DBGS() << "Generating wgmma.descriptor: "
+    LLVM_DEBUG(DBGS() << "Generating warpgroup.descriptor: "
                       << "leading_off:" << leadDimVal << "\t"
                       << "stride_off :" << strideDimVal << "\t"
                       << "base_offset:" << offsetVal << "\t"
@@ -1050,16 +1050,13 @@ struct NVGPUGenerateGmmaDescriptorLowering
   }
 };
 
-static Value makeI64Const(RewriterBase &rewriter, Operation *op,
-                          int32_t index) {
-  return rewriter.create<LLVM::ConstantOp>(op->getLoc(),
-                                           rewriter.getIntegerType(64),
-                                           rewriter.getI32IntegerAttr(index));
+static Value makeI64Const(ImplicitLocOpBuilder &b, int32_t index) {
+  return b.create<LLVM::ConstantOp>(b.getIntegerType(64),
+                                    b.getI32IntegerAttr(index));
 }
 
 /// Returns a Value that holds data type enum that is expected by CUDA driver.
-static Value elementTypeAsLLVMConstant(RewriterBase &rewriter, Operation *op,
-                                       Type type) {
+static Value elementTypeAsLLVMConstant(ImplicitLocOpBuilder &b, Type type) {
   // Enum is from CUDA driver API
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
   enum CUtensorMapDataTypeEnum {
@@ -1079,25 +1076,25 @@ static Value elementTypeAsLLVMConstant(RewriterBase &rewriter, Operation *op,
   };
 
   if (type.isUnsignedInteger(8))
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT8);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT8);
   if (type.isUnsignedInteger(16))
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT16);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT16);
   if (type.isUnsignedInteger(32))
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT32);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT32);
   if (type.isUnsignedInteger(64))
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT64);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_UINT64);
   if (type.isSignlessInteger(32))
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_INT32);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_INT32);
   if (type.isSignlessInteger(64))
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_INT64);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_INT64);
   if (type.isF16())
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
   if (type.isF32())
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
   if (type.isF64())
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
   if (type.isBF16())
-    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
+    return makeI64Const(b, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
 
   llvm_unreachable("Not supported data type");
 }
@@ -1109,23 +1106,22 @@ struct NVGPUTmaCreateDescriptorOpLowering
   LogicalResult
   matchAndRewrite(nvgpu::TmaCreateDescriptorOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Location loc = op->getLoc();
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
     LLVM::LLVMPointerType llvmPointerType = getTypeConverter()->getPointerType(
         IntegerType::get(op->getContext(), 8));
     Type llvmInt64Type = IntegerType::get(op->getContext(), 64);
 
-    Value tensorElementType = elementTypeAsLLVMConstant(
-        rewriter, op, op.getTensor().getType().getElementType());
+    Value tensorElementType =
+        elementTypeAsLLVMConstant(b, op.getTensor().getType().getElementType());
     auto promotedOperands = getTypeConverter()->promoteOperands(
-        loc, op->getOperands(), adaptor.getOperands(), rewriter);
+        b.getLoc(), op->getOperands(), adaptor.getOperands(), b);
 
-    Value boxArrayPtr = rewriter.create<LLVM::AllocaOp>(
-        loc, llvmPointerType, llvmInt64Type, makeI64Const(rewriter, op, 5));
+    Value boxArrayPtr = b.create<LLVM::AllocaOp>(llvmPointerType, llvmInt64Type,
+                                                 makeI64Const(b, 5));
     for (auto [index, value] : llvm::enumerate(adaptor.getBoxDimensions())) {
-      Value gep = rewriter.create<LLVM::GEPOp>(
-          loc, llvmPointerType, llvmPointerType, boxArrayPtr,
-          makeI64Const(rewriter, op, index));
-      rewriter.create<LLVM::StoreOp>(loc, value, gep);
+      Value gep = b.create<LLVM::GEPOp>(llvmPointerType, llvmPointerType,
+                                        boxArrayPtr, makeI64Const(b, index));
+      b.create<LLVM::StoreOp>(value, gep);
     }
 
     nvgpu::TensorMapDescriptorType desc = op.getTensorMap().getType();
@@ -1135,12 +1131,10 @@ struct NVGPUTmaCreateDescriptorOpLowering
     arguments.push_back(promotedOperands[1]); // descriptor
     arguments.push_back(tensorElementType);   // data type
     arguments.push_back(
-        makeI64Const(rewriter, op, (int)desc.getInterleave())); // interleave
-    arguments.push_back(
-        makeI64Const(rewriter, op, (int)desc.getSwizzle())); // swizzle
-    arguments.push_back(
-        makeI64Const(rewriter, op, (int)desc.getL2promo())); // l2promo
-    arguments.push_back(makeI64Const(rewriter, op, (int)desc.getOob())); // oob
+        makeI64Const(b, (int)desc.getInterleave()));              // interleave
+    arguments.push_back(makeI64Const(b, (int)desc.getSwizzle())); // swizzle
+    arguments.push_back(makeI64Const(b, (int)desc.getL2promo())); // l2promo
+    arguments.push_back(makeI64Const(b, (int)desc.getOob()));     // oob
     arguments.push_back(boxArrayPtr); // box dimensions
 
     // Set data types of the arguments
@@ -1157,7 +1151,7 @@ struct NVGPUTmaCreateDescriptorOpLowering
     FunctionCallBuilder hostRegisterCallBuilder = {
         "mgpuTensorMapEncodeTiledMemref", llvmPointerType, argTypes};
     Value tensorMap =
-        hostRegisterCallBuilder.create(loc, rewriter, arguments).getResult();
+        hostRegisterCallBuilder.create(b.getLoc(), b, arguments).getResult();
 
     rewriter.replaceOp(op, tensorMap);
     return success();
@@ -1168,140 +1162,386 @@ struct NVGPUWarpgroupMmaOpLowering
     : public ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaOp> {
   using ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaOp>::ConvertOpToLLVMPattern;
 
-  LogicalResult getWgmmaShape(int64_t sizeM, int64_t sizeN, Type inputElemType,
-                              int &wgmmaShapeM, int &wgmmaShapeN,
-                              int &wgmmaShapeK) const {
-    wgmmaShapeM = 64;
-    wgmmaShapeN = sizeN;
-    if (inputElemType.isTF32()) {
-      wgmmaShapeK = 8;
-    } else if (inputElemType.isF16() || inputElemType.isBF16()) {
-      wgmmaShapeK = 16;
-    } else if (inputElemType.isFloat8E4M3FN() || inputElemType.isFloat8E5M2() ||
-               inputElemType.isInteger(16)) {
-      wgmmaShapeK = 32;
-    } else if (inputElemType.isInteger(1)) {
-      wgmmaShapeK = 256;
-    } else {
-      llvm_unreachable("msg: not supported K shape");
+  /// This is a helper class to generate required NVVM Ops for warp-group level
+  /// matrix multiplication.
+  /// When the given GEMM shape is larger than the shape of
+  /// a wgmma instrution in PTX, it can generate multiple NVVM::WgmmaMmaAsyncOp
+  /// Op(s), group and execute them asynchronously. The class also handles
+  /// waiting for completion and iterates through WarpgroupMatrixDescriptor to
+  /// create descriptors for each instruction.
+  ///
+  /// For example this is the case when the shape of GEMM is 128x128x128
+  ///
+  ///    nvvm.wgmma.fence.aligned
+  ///
+  ///    nvvm.wgmma.mma.async descA, descB
+  ///    iterate(descA, descB)
+  ///    nvvm.wgmma.mma.async descA, descB
+  ///    [6x times more]
+  ///
+  ///    nvvm.wgmma.group.sync.aligned
+  ///    nvvm.wgmma.wait.group.sync [groupId]
+  ///
+  class WarpgroupGemm {
+    nvgpu::WarpgroupMmaOp op;
+    ImplicitLocOpBuilder b;
+    OpAdaptor adaptor;
+    const LLVMTypeConverter &typeConverter;
+
+    // Entire shape of the given Op
+    int64_t totalM, totalN, totalK;
+
+    // Shape of one wgmma instruction
+    int wgmmaM = 0, wgmmaN = 0, wgmmaK = 0;
+
+    // Iteration counts for GEMM
+    int iterationM = 0, iterationN = 0, iterationK = 0;
+
+    /// The function returns the shape of wgmma instruction that is defined in
+    /// PTX programming guide.
+    /// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shape
+    void findWgmmaShape(int64_t sizeM, int64_t sizeN, Type inputElemType) {
+      wgmmaM = 64;
+      wgmmaN = sizeN;
+      if (inputElemType.isTF32()) {
+        wgmmaK = 8;
+      } else if (inputElemType.isF16() || inputElemType.isBF16()) {
+        wgmmaK = 16;
+      } else if (inputElemType.isFloat8E4M3FN() ||
+                 inputElemType.isFloat8E5M2() || inputElemType.isInteger(16)) {
+        wgmmaK = 32;
+      } else if (inputElemType.isInteger(1)) {
+        wgmmaK = 256;
+      } else {
+        llvm_unreachable("msg: not supported K shape");
+      }
+      LLVM_DEBUG(DBGS() << "Generating WgmmaMmaAsyncOp shape[m = " << wgmmaM
+                        << ", n = " << wgmmaN << ", k = " << wgmmaK << "]\n");
     }
-    LLVM_DEBUG(DBGS() << "Generating wgmma.mma.async shape[m = " << wgmmaShapeM
-                      << ", n = " << wgmmaShapeN << ", k = " << wgmmaShapeK
-                      << "]\n");
-    return success();
-  }
-
-  Value generateNVVMWgmmaOp(MLIRContext *ctx,
-                            ConversionPatternRewriter &rewriter, Location loc,
-                            int m, int n, int k, Type resultStructType,
-                            Value inout, Value descriptorA,
-                            Value descriptorB) const {
-    auto shape = NVVM::MMAShapeAttr::get(ctx, m, n, k);
-    auto scaleOut = NVVM::WGMMAScaleOutAttr::get(ctx, NVVM::WGMMAScaleOut::one);
-    auto scaleIn = NVVM::WGMMAScaleInAttr::get(ctx, NVVM::WGMMAScaleIn::one);
-    auto layoutA = NVVM::MMALayoutAttr::get(ctx, NVVM::MMALayout::row);
-    auto layoutB = NVVM::MMALayoutAttr::get(ctx, NVVM::MMALayout::col);
-    // todo: handle other input and output types
-    auto itype = NVVM::WGMMATypesAttr::get(ctx, NVVM::WGMMATypes::f16);
-    auto overflow =
-        NVVM::MMAIntOverflowAttr::get(ctx, NVVM::MMAIntOverflow::wrapped);
-    Value res = rewriter.create<NVVM::WgmmaMmaAsyncOp>(
-        loc, resultStructType, inout, descriptorA, descriptorB, shape, itype,
-        itype, scaleOut, scaleIn, scaleIn, layoutA, layoutB, overflow);
-    return res;
-  }
-
-  LogicalResult
-  matchAndRewrite(nvgpu::WarpgroupMmaOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    int64_t sizeM = op.getDescriptorA().getType().getTensor().getDimSize(0);
-    int64_t sizeN = op.getDescriptorB().getType().getTensor().getDimSize(1);
-    int64_t sizeK = op.getDescriptorA().getType().getTensor().getDimSize(1);
-
-    LLVM_DEBUG(DBGS() << "===--- GEMM D[" << sizeM << "][" << sizeN << "] += A["
-                      << sizeM << "][" << sizeK << "] * B[" << sizeK << "]["
-                      << sizeN << "] ---===\n");
 
-    int wgmmaShapeM, wgmmaShapeN, wgmmaShapeK;
-    if (failed(getWgmmaShape(sizeM, sizeN, rewriter.getF16Type(), wgmmaShapeM,
-                             wgmmaShapeN, wgmmaShapeK))) {
-      return failure();
+    /// Generates WGMMATypesAttr from MLIR Type
+    NVVM::WGMMATypesAttr generateWgmmaType(Type type) const {
+      auto getWgmmaType = [](Type elemType) {
+        if (elemType.isF32() || elemType.isTF32())
+          return NVVM::WGMMATypes::tf32;
+        if (elemType.isF16())
+          return NVVM::WGMMATypes::f16;
+        if (elemType.isBF16())
+          return NVVM::WGMMATypes::bf16;
+        if (elemType.isFloat8E4M3FN())
+          return NVVM::WGMMATypes::e4m3;
+        if (elemType.isFloat8E5M2())
+          return NVVM::WGMMATypes::e5m2;
+        if (elemType.isInteger(1))
+          return NVVM::WGMMATypes::b1;
+        if (elemType.isInteger(8))
+          return NVVM::WGMMATypes::s8;
+        if (elemType.isUnsignedInteger(8))
+          return NVVM::WGMMATypes::u8;
+        llvm_unreachable("unsupported type");
+      };
+      return NVVM::WGMMATypesAttr::get(op->getContext(), getWgmmaType(type));
     }
 
-    Value descriptorA = adaptor.getDescriptorA();
-    Value descriptorB = adaptor.getDescriptorB();
+    /// Generates layout attribute for the input matrix for wgmma instruction
+    NVVM::MMALayoutAttr
+    generateWgmmaLayout(std::optional<bool> transpose) const {
+      if (transpose.value_or(false))
+        return NVVM::MMALayoutAttr::get(op->getContext(), NVVM::MMALayout::col);
+      return NVVM::MMALayoutAttr::get(op->getContext(), NVVM::MMALayout::row);
+    }
 
-    //  Generate wgmma group
+    /// Generates shape attribute for wgmma instruction
+    NVVM::MMAShapeAttr generateWgmmaShape() const {
+      return NVVM::MMAShapeAttr::get(op->getContext(), wgmmaM, wgmmaN, wgmmaK);
+    }
 
-    auto loc = op->getLoc();
-    MemRefType typeTensorA = op.getDescriptorA().getType().getTensor();
-    MemRefType typeTensorB = op.getDescriptorB().getType().getTensor();
+    /// Generates scale attributes of output matrix for wgmma instruction
+    NVVM::WGMMAScaleOutAttr generateScaleOut() const {
+      return NVVM::WGMMAScaleOutAttr::get(op->getContext(),
+                                          NVVM::WGMMAScaleOut::one);
+    }
+    /// Generates scale attributes of input matrix for wgmma instruction
+    NVVM::WGMMAScaleInAttr generateScaleIn() const {
+      return NVVM::WGMMAScaleInAttr::get(op->getContext(),
+                                         NVVM::WGMMAScaleIn::one);
+    }
 
-    auto makeAdd = [&](Value lhs, Value rhs) -> Value {
-      return rewriter.create<LLVM::AddOp>(loc, lhs.getType(), lhs, rhs);
+    /// Basic function to generate Add
+    Value makeAdd(Value lhs, Value rhs) {
+      return b.create<LLVM::AddOp>(lhs.getType(), lhs, rhs);
     };
 
-    auto iterateDescA = [&](Value desc, int iterM, int iterN,
-                            int iterK) -> Value {
-      // todo : Handle column major
-      int byte = typeTensorA.getElementTypeBitWidth() / 8;
-      int tileShapeA = typeTensorA.getDimSize(1);
-      int incrementVal =
-          ((wgmmaShapeK * iterK) + (sizeK * tileShapeA * iterM)) * byte;
+    /// Moves the descriptor pointer of matrix-A for the next wgmma instruction.
+    /// Currently, it only handles row-major.
+    ///
+    /// It moves the pointer like below for [128][64] size:
+    ///                 +2 +4 +6
+    ///                  ↓  ↓  ↓
+    /// descA    ---> +--+--+--+--+
+    ///               |->|->|->|->|
+    ///               |  |  |  |  |
+    ///               |  |  |  |  |
+    ///               |  |  |  |  |
+    /// descA+512---> +-----------+
+    ///               |  |  |  |  |
+    ///               |  |  |  |  |
+    ///               |  |  |  |  |
+    ///               |  |  |  |  |
+    ///               +-----------+
+    ///
+    Value iterateDescriptorA(Value desc, int i, int j, int k) {
+      MemRefType matrixTypeA = op.getDescriptorA().getType().getTensor();
+      Type elemA = matrixTypeA.getElementType();
+      int byte = elemA.getIntOrFloatBitWidth() / 8;
+      int tileShapeA = matrixTypeA.getDimSize(1);
+      int incrementVal = ((wgmmaK * k) + (totalK * tileShapeA * i)) * byte;
       incrementVal = incrementVal >> exclude4LSB;
-      LLVM_DEBUG(DBGS() << "\t\t[m: " << iterM << " n: " << iterN << " k: "
-                        << iterK << "] [wgmma descriptors] Descriptor A + "
+      LLVM_DEBUG(DBGS() << "\t\t[m: " << i << " n: " << j << " k: " << k
+                        << "] [wgmma descriptors] Descriptor A + "
                         << incrementVal << " | \t ");
       if (!incrementVal)
         return desc;
-      return makeAdd(desc, makeI64Const(rewriter, op, incrementVal));
-    };
+      return makeAdd(desc, makeI64Const(b, incrementVal));
+    }
 
-    auto iterateDescB = [&](Value desc, int iterM, int iterN,
-                            int iterK) -> Value {
-      // todo : Handle row major
-      int byte = typeTensorB.getElementTypeBitWidth() / 8;
-      int incrementVal = typeTensorB.getDimSize(0) * wgmmaShapeK * iterK * byte;
+    /// Moves the descriptor pointer of matrix-B for the next wgmma instruction.
+    /// Currently, it only handles column-major.
+    ///
+    /// It moves the pointer like below for [128][64] size:
+    /// descB     ---> +--+--+--+--+--+--+--+--+
+    ///                |↓ |  |  |  |  |  |  |  |
+    ///                |↓ |  |  |  |  |  |  |  |
+    ///                |↓ |  |  |  |  |  |  |  |
+    ///                |↓ |  |  |  |  |  |  |  |
+    ///                +--+--+--+--+--+--+--+--+
+    ///
+    Value iterateDescriptorB(Value desc, int i, int j, int k) {
+      MemRefType matrixTypeB = op.getDescriptorB().getType().getTensor();
+      Type elemB = matrixTypeB.getElementType();
+      int byte = elemB.getIntOrFloatBitWidth() / 8;
+      int incrementVal = matrixTypeB.getDimSize(0) * wgmmaK * k * byte;
       incrementVal = incrementVal >> exclude4LSB;
       LLVM_DEBUG(DBGSE() << "Descriptor B + " << incrementVal << "\n");
       if (!incrementVal)
         return desc;
-      return makeAdd(desc, makeI64Const(rewriter, op, incrementVal));
+      return makeAdd(desc, makeI64Const(b, incrementVal));
+    }
+
+    /// This function generates a WgmmaMmaAsyncOp using provided GMMA matrix
+    /// descriptors and arranges them based on induction variables: i, j, and k.
+    Value generateWgmma(int i, int j, int k, Value matrixC, Value matrixD) {
+      LLVM_DEBUG(DBGS() << "\t wgmma."
+                        << "m" << wgmmaM << "n" << wgmmaN << "k" << wgmmaK
+                        << "(A[" << (iterationM * wgmmaM) << ":"
+                        << (iterationM * wgmmaM) + wgmmaM << "]["
+                        << (iterationK * wgmmaK) << ":"
+                        << (iterationK * wgmmaK + wgmmaK) << "] * "
+                        << " B[" << (iterationK * wgmmaK) << ":"
+                        << (iterationK * wgmmaK + wgmmaK) << "][" << 0 << ":"
+                        << wgmmaN << "])\n");
+
+      Value descriptorA = iterateDescriptorA(adaptor.getDescriptorA(), i, j, k);
+      Value descriptorB = iterateDescriptorB(adaptor.getDescriptorB(), i, j, k);
+
+      Type elemA = op.getDescriptorA().getType().getTensor().getElementType();
+      NVVM::WGMMATypesAttr itypeA = generateWgmmaType(elemA);
+
+      Type elemB = op.getDescriptorB().getType().getTensor().getElementType();
+      NVVM::WGMMATypesAttr itypeB = generateWgmmaType(elemB);
+
+      NVVM::MMAShapeAttr shape = generateWgmmaShape();
+      NVVM::WGMMAScaleOutAttr scaleOut = generateScaleOut();
+      NVVM::WGMMAScaleInAttr scaleIn = generateScaleIn();
+      NVVM::MMALayoutAttr layoutA = generateWgmmaLayout(op.getTransposeA());
+      NVVM::MMALayoutAttr layoutB = generateWgmmaLayout(op.getTransposeB());
+
+      auto overflow = NVVM::MMAIntOverflowAttr::get(
+          op->getContext(), NVVM::MMAIntOverflow::wrapped);
+
+      Type resultStructType = typeConverter.convertType(matrixD.getType());
+
+      return b.create<NVVM::WgmmaMmaAsyncOp>(
+          resultStructType, matrixC, descriptorA, descriptorB, shape, itypeA,
+          itypeB, scaleOut, scaleIn, scaleIn, layoutA, layoutB, overflow);
+    }
+
+    /// Generates multiple wgmma instructions to complete the given GEMM shape
+    SmallVector<Value> generateWgmmaGroup() {
+      SmallVector<Value> wgmmaResults;
+
+      // Perform GEMM
+      for (int i = 0; i < iterationM; ++i) {
+        Value matrixC = adaptor.getMatrixC()[i];
+        Value matrixD = op.getMatrixD()[i];
+        for (int j = 0; j < iterationN; ++j)
+          for (int k = 0; k < iterationK; ++k)
+            matrixC = generateWgmma(i, j, k, matrixC, matrixD);
+        wgmmaResults.push_back(matrixC);
+      }
+
+      return wgmmaResults;
+    }
+
+  public:
+    WarpgroupGemm(nvgpu::WarpgroupMmaOp op, ImplicitLocOpBuilder &b,
+                  OpAdaptor adaptor, const LLVMTypeConverter &typeConverter)
+        : op(op), b(b), adaptor(adaptor), typeConverter(typeConverter) {
+      // Find the entire GEMM Shape
+      totalM = op.getDescriptorA().getType().getTensor().getDimSize(0);
+      totalN = op.getDescriptorB().getType().getTensor().getDimSize(1);
+      totalK = op.getDescriptorA().getType().getTensor().getDimSize(1);
+      LLVM_DEBUG(DBGS() << "===--- GEMM D[" << totalM << "][" << totalN
+                        << "] += A[" << totalM << "][" << totalK << "] * B["
+                        << totalK << "][" << totalN << "] ---===\n");
+
+      // Find the shape for one wgmma instruction
+      findWgmmaShape(
+          totalM, totalN,
+          op.getDescriptorA().getType().getTensor().getElementType());
+
+      // Iterations counts to complete the given shape with wgmma shape
+      iterationM = totalM / wgmmaM;
+      iterationN = totalN / wgmmaN;
+      iterationK = totalK / wgmmaK;
+    }
+
+    /// Generates WgmmaMmaAsync Ops to complete the specified GEMM  shape. It
+    /// includes generating a fence Op (WgmmaFenceAlignedOp) before the
+    /// instructions and group synchronization, as well as waiting
+    /// (WgmmaGroupSyncAlignedOp) for group synchronization
+    /// (WgmmaWaitGroupSyncOp) after the instructions.
+    SmallVector<Value> generateWarpgroupMma() {
+      b.create<NVVM::WgmmaFenceAlignedOp>();
+      SmallVector<Value> wgmmaResults = generateWgmmaGroup();
+      b.create<NVVM::WgmmaGroupSyncAlignedOp>();
+      b.create<NVVM::WgmmaWaitGroupSyncOp>(op.getWaitGroup());
+      return wgmmaResults;
+    }
+  };
+
+  LogicalResult
+  matchAndRewrite(nvgpu::WarpgroupMmaOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+    // Step 1. Build a helper class
+    WarpgroupGemm warpgroupGemm(op, b, adaptor, *this->getTypeConverter());
+
+    // Step 2. Get the entire GEMM Shape
+    SmallVector<Value> wgmmaResults = warpgroupGemm.generateWarpgroupMma();
+
+    // Step 3. Replace fragmented result struct with the op results
+    rewriter.replaceOp(op, wgmmaResults);
+    return success();
+  }
+};
+
+struct NVGPUWarpgroupMmaStoreOpLowering
+    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaStoreOp> {
+  using ConvertOpToLLVMPattern<
+      nvgpu::WarpgroupMmaStoreOp>::ConvertOpToLLVMPattern;
+
+  /// This function stores a fragmented register matrix owned by a warp group
+  /// (128 threads) into a memref. Each thread has 64 registers, each the size
+  /// of a struct.
+  /// Here is what each threads (T) holds, each `d` is struct value with a
+  /// number.
+  ///
+  /// Threads in warp-group (128 threads) and what they owns in the matrixD:
+  /// 0-31 	  Warp-0  -> MatrixD[0:15 ][0:N]
+  /// 32-63 	Warp-1  -> MatrixD[16:31][0:N]
+  /// 64-95 	Warp-2  -> MatrixD[32:47][0:N]
+  /// 96-127 	Warp-3  -> MatrixD[48:64][0:N]
+  ///
+  /// Matrix-D:
+  ///   +______________________________________________________________________+
+  ///   |     0-1  |    2-3  |    4-5  |    6-7  |   8-9  |   10-11|..|N-8,N-7 |
+  /// 0 | T0:d0-d1 |T1:d0-d1 |T2:d0-d1 |T3:d0-d1 |T0:d4-d5| T1:d4-d5..|T0:dX-dY|
+  /// 1 | T4:d0-d1 |T5:d0-d1 |T6:d0-d1 |T7:d0-d1 |T4:d4-d5| T5:d4-d5..|T4:dX-dY|
+  /// ..| .........|.........|.........|.........|........|...........|........|
+  /// 8 | T0:d2-d3 |T1:d2-d3 |T2:d2-d3 |T3:d2-d3 |T0:d6-d7|T1:d6-d7,..|T0:dZ-dW|
+  /// 9 | T4:d2-d3 |T5:d2-d3 |T6:d2-d3 |T7:d2-d3 |T4:d6-d7| T5:d6-d7..|T4:dZ-dW|
+  /// ..| .........|.........|.........|.........|........|...........|........|
+  /// 15| T28:d2-d3|T29:d2-d3|T30:d2-d3|T31:d2-d3|........|...........|........|
+  /// 16| T32:d2-d3|T33:d2-d3|T34:d2-d3|T35:d2-d3|........|...........|........|
+  /// ..| .........|.........|.........|.........|........|...........|........|
+  /// 32| T64:d2-d3|T65:d2-d3|T66:d2-d3|T67:d2-d3|........|...........|........|
+  /// ..| .........|.........|.........|.........|........|...........|........|
+  /// 48| T96:d2-d3|T97:d2-d3|T98:d2-d3|T99:d2-d3|........|...........|........|
+  /// ..| .........|.........|.........|.........|........|...........|........|
+  ///   +______________________________________________________________________+
+  ///
+  /// \param rewriter: The pattern rewriter.
+  /// \param matrixD: Result of the warp-group MMA operation (fragmented
+  /// matrix). It is holded by a thread and a struct with 64 elements.
+  /// \param dstMemref: The memref where the registers will be stored.
+  /// \param offset: the offset within the memref where the registers will be
+  /// stored.
+  void storeFragmentedMatrix(ImplicitLocOpBuilder &b, Value matrixD,
+                             TypedValue<MemRefType> dstMemref,
+                             int offset) const {
+    Type i32 = b.getI32Type();
+
+    auto makeConst = [&](int32_t index) -> Value {
+      return b.create<LLVM::ConstantOp>(i32, b.getI32IntegerAttr(index));
+    };
+    Value c1 = makeConst(1);
+    Value c2 = makeConst(2);
+    Value c4 = makeConst(4);
+    Value c8 = makeConst(8);
+    Value c16 = makeConst(16);
+    Value warpSize = makeConst(kWarpSize);
+
+    auto makeMul = [&](Value lhs, Value rhs) -> Value {
+      return b.create<LLVM::MulOp>(lhs.getType(), lhs, rhs);
+    };
+    auto makeAdd = [&](Value lhs, Value rhs) -> Value {
+      return b.create<LLVM::AddOp>(lhs.getType(), lhs, rhs);
     };
 
-    rewriter.create<NVVM::WgmmaFenceAlignedOp>(loc);
-
-    SmallVector<Value> wgmmaResults;
-    for (int iterM = 0; iterM < (sizeM / wgmmaShapeM); iterM++) {
-      Value matrixC = adaptor.getMatrixC()[iterM];
-      Value matrixD = op.getMatrixD()[iterM];
-      Type structType = getTypeConverter()->convertType(matrixD.getType());
-      LLVM_DEBUG(DBGS() << " D[" << (iterM * wgmmaShapeM) << ":"
-                        << (iterM * wgmmaShapeM) + wgmmaShapeM << "][" << 0
-                        << ":" << wgmmaShapeN << "] += \n");
-      for (int iterK = 0; iterK < (sizeK / wgmmaShapeK); iterK++) {
-        Value descA = iterateDescA(descriptorA, iterM, 0, iterK);
-        Value descB = iterateDescB(descriptorB, iterM, 0, iterK);
-        LLVM_DEBUG(DBGS() << "\t wgmma."
-                          << "m" << wgmmaShapeM << "n" << wgmmaShapeN << "k"
-                          << wgmmaShapeK << "(A[" << (iterM * wgmmaShapeM)
-                          << ":" << (iterM * wgmmaShapeM) + wgmmaShapeM << "]["
-                          << (iterK * wgmmaShapeK) << ":"
-                          << (iterK * wgmmaShapeK + wgmmaShapeK) << "] * "
-                          << " B[" << (iterK * wgmmaShapeK) << ":"
-                          << (iterK * wgmmaShapeK + wgmmaShapeK) << "][" << 0
-                          << ":" << wgmmaShapeN << "])\n");
-        matrixC = generateNVVMWgmmaOp(op->getContext(), rewriter, loc,
-                                      wgmmaShapeM, wgmmaShapeN, wgmmaShapeK,
-                                      structType, matrixC, descA, descB);
+    Value tidx = b.create<NVVM::ThreadIdXOp>(i32);
+    Value laneId = b.create<LLVM::URemOp>(i32, tidx, warpSize);
+    Value warpId = b.create<LLVM::UDivOp>(i32, tidx, warpSize);
+    Value lane4Id = b.create<LLVM::UDivOp>(i32, laneId, c4);
+    Value lane4modId = b.create<LLVM::URemOp>(i32, laneId, c4);
+
+    auto makeExtractAndStore = [&](int i, Value wgmmaResult, Value x, Value y,
+                                   TypedValue<::mlir::MemRefType> memref) {
+      Type it = b.getIndexType();
+      Value idx = b.create<arith::IndexCastOp>(it, x);
+      Value idy0 = b.create<arith::IndexCastOp>(it, y);
+      Value idy1 = b.create<arith::IndexCastOp>(it, makeAdd(y, c1));
+      Value d0 = b.create<LLVM::ExtractValueOp>(wgmmaResult, i);
+      Value d1 = b.create<LLVM::ExtractValueOp>(wgmmaResult, i + 1);
+      b.create<memref::StoreOp>(d0, memref, ValueRange{idx, idy0});
+      b.create<memref::StoreOp>(d1, memref, ValueRange{idx, idy1});
+    };
+
+    Value tj = makeMul(lane4modId, c2);
+    Value ti = makeAdd(lane4Id, makeMul(warpId, c16));
+    if (offset)
+      ti = makeAdd(ti, makeConst(offset));
+    for (int i = 0; i < 2; ++i) {
+      Value idx = makeAdd(ti, makeMul(makeConst(i), c8));
+      for (int j = 0; j < 16; ++j) {
+        Value idy = makeAdd(tj, makeMul(makeConst(j), c8));
+        int sIndex = i * 2 + j * 4;
+        makeExtractAndStore(sIndex, matrixD, idx, idy, dstMemref);
       }
-      wgmmaResults.push_back(matrixC);
     }
-    rewriter.create<NVVM::WgmmaGroupSyncAlignedOp>(loc);
-    rewriter.create<NVVM::WgmmaWaitGroupSyncOp>(loc, op.getWaitGroup());
+  }
 
-    ValueRange myres(wgmmaResults);
-    rewriter.replaceOp(op, myres);
+  LogicalResult
+  matchAndRewrite(nvgpu::WarpgroupMmaStoreOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    int offset = 0;
+    ImplicitLocOpBuilder lb(op->getLoc(), rewriter);
+    for (Value matrixD : adaptor.getMatrixD()) {
+      auto structType = matrixD.getType().cast<LLVM::LLVMStructType>();
+      storeFragmentedMatrix(lb, matrixD, op.getDstMemref(), offset);
+      offset += structType.getBody().size();
+    }
+    rewriter.eraseOp(op);
     return success();
   }
 };
@@ -1320,8 +1560,9 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
       NVGPUTmaAsyncLoadOpLowering,           // nvgpu.tma.async.load
       NVGPUTmaCreateDescriptorOpLowering,    // nvgpu.tma.create.descriptor
       NVGPUMBarrierArriveExpectTxLowering,   // nvgpu.mbarrier.arrive.expect_tx
-      NVGPUGenerateGmmaDescriptorLowering,   // nvgpu.wgmma.generate.descriptor
-      NVGPUWarpgroupMmaOpLowering,           // nvgpu.warpgroup.mma
+      NVGPUGenerateWarpgroupDescriptorLowering, // nvgpu.warpgroup.generate.descriptor
+      NVGPUWarpgroupMmaOpLowering,              // nvgpu.warpgroup.mma
+      NVGPUWarpgroupMmaStoreOpLowering,         // nvgpu.warpgroup.mma.store
       MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
       NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
       NVGPUMmaSparseSyncLowering>(converter);
diff --git a/mlir/lib/Debug/CLOptionsSetup.cpp b/mlir/lib/Debug/CLOptionsSetup.cpp
index 24d0bd8808261d..76330b2a0a92f1 100644
--- a/mlir/lib/Debug/CLOptionsSetup.cpp
+++ b/mlir/lib/Debug/CLOptionsSetup.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Debug/DebuggerExecutionContextHook.h"
 #include "mlir/Debug/ExecutionContext.h"
 #include "mlir/Debug/Observers/ActionLogging.h"
+#include "mlir/Debug/Observers/ActionProfiler.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/FileUtilities.h"
 #include "llvm/Support/CommandLine.h"
@@ -30,6 +31,12 @@ struct DebugConfigCLOptions : public DebugConfig {
                  " '-' is passed"),
         cl::location(logActionsToFlag)};
 
+    static cl::opt<std::string, /*ExternalStorage=*/true> profileActionsTo{
+        "profile-actions-to",
+        cl::desc("Profile action execution to a file, or stderr if "
+                 " '-' is passed"),
+        cl::location(profileActionsToFlag)};
+
     static cl::list<std::string> logActionLocationFilter(
         "log-mlir-actions-filter",
         cl::desc(
@@ -71,6 +78,7 @@ class InstallDebugHandler::Impl {
 public:
   Impl(MLIRContext &context, const DebugConfig &config) {
     if (config.getLogActionsTo().empty() &&
+        config.getProfileActionsTo().empty() &&
         !config.isDebuggerActionHookEnabled()) {
       if (tracing::DebugCounter::isActivated())
         context.registerActionHandler(tracing::DebugCounter());
@@ -97,6 +105,24 @@ class InstallDebugHandler::Impl {
         actionLogger->addBreakpointManager(locationBreakpoint);
       executionContext.registerObserver(actionLogger.get());
     }
+
+    if (!config.getProfileActionsTo().empty()) {
+      std::string errorMessage;
+      profileActionsFile =
+          openOutputFile(config.getProfileActionsTo(), &errorMessage);
+      if (!profileActionsFile) {
+        emitError(UnknownLoc::get(&context),
+                  "Opening file for --profile-actions-to failed: ")
+            << errorMessage << "\n";
+        return;
+      }
+      profileActionsFile->keep();
+      raw_fd_ostream &profileActionsStream = profileActionsFile->os();
+      actionProfiler =
+          std::make_unique<tracing::ActionProfiler>(profileActionsStream);
+      executionContext.registerObserver(actionProfiler.get());
+    }
+
     if (config.isDebuggerActionHookEnabled()) {
       errs() << " (with Debugger hook)";
       setupDebuggerExecutionContextHook(executionContext);
@@ -111,6 +137,8 @@ class InstallDebugHandler::Impl {
   std::unique_ptr<tracing::ActionLogger> actionLogger;
   std::vector<std::unique_ptr<tracing::FileLineColLocBreakpoint>>
       locationBreakpoints;
+  std::unique_ptr<ToolOutputFile> profileActionsFile;
+  std::unique_ptr<tracing::ActionProfiler> actionProfiler;
 };
 
 InstallDebugHandler::InstallDebugHandler(MLIRContext &context,
diff --git a/mlir/lib/Debug/Observers/ActionProfiler.cpp b/mlir/lib/Debug/Observers/ActionProfiler.cpp
new file mode 100644
index 00000000000000..07bf9fd0ccc7db
--- /dev/null
+++ b/mlir/lib/Debug/Observers/ActionProfiler.cpp
@@ -0,0 +1,63 @@
+//===- ActionProfiler.cpp -  Profiling Actions *- C++ -*-=====================//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Debug/Observers/ActionProfiler.h"
+#include "mlir/Debug/BreakpointManager.h"
+#include "mlir/IR/Action.h"
+#include "mlir/Rewrite/PatternApplicator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+#include <chrono>
+
+using namespace mlir;
+using namespace mlir::tracing;
+
+//===----------------------------------------------------------------------===//
+// ActionProfiler
+//===----------------------------------------------------------------------===//
+void ActionProfiler::beforeExecute(const ActionActiveStack *action,
+                                   Breakpoint *breakpoint, bool willExecute) {
+  print(action, "B"); // begin event.
+}
+
+void ActionProfiler::afterExecute(const ActionActiveStack *action) {
+  print(action, "E"); // end event.
+}
+
+// Print an event in JSON format.
+void ActionProfiler::print(const ActionActiveStack *action,
+                           llvm::StringRef phase) {
+  // Create the event.
+  std::string str;
+  llvm::raw_string_ostream event(str);
+  event << "{";
+  event << R"("name": ")" << action->getAction().getTag() << "\", ";
+  event << R"("cat": "PERF", )";
+  event << R"("ph": ")" << phase << "\", ";
+  event << R"("pid": 0, )";
+  event << R"("tid": )" << llvm::get_threadid() << ", ";
+  auto ts = std::chrono::steady_clock::now() - startTime;
+  event << R"("ts": )"
+        << std::chrono::duration_cast<std::chrono::microseconds>(ts).count();
+  if (phase == "B") {
+    event << R"(, "args": {)";
+    event << R"("desc": ")";
+    action->getAction().print(event);
+    event << "\"}";
+  }
+  event << "}";
+
+  // Print the event.
+  std::lock_guard<std::mutex> guard(mutex);
+  if (printComma)
+    os << ",\n";
+  printComma = true;
+  os << event.str();
+  os.flush();
+}
diff --git a/mlir/lib/Debug/Observers/CMakeLists.txt b/mlir/lib/Debug/Observers/CMakeLists.txt
index 0a4261b0e45cc1..78d8038f6f1265 100644
--- a/mlir/lib/Debug/Observers/CMakeLists.txt
+++ b/mlir/lib/Debug/Observers/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRObservers
   ActionLogging.cpp
+  ActionProfiler.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Debug/Observers
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index c61bd566c7676f..113f4cfc31c104 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -3915,6 +3915,49 @@ void AffineParallelOp::setSteps(ArrayRef<int64_t> newSteps) {
   setStepsAttr(getBodyBuilder().getI64ArrayAttr(newSteps));
 }
 
+// check whether resultType match op or not in affine.parallel
+static bool isResultTypeMatchAtomicRMWKind(Type resultType,
+                                           arith::AtomicRMWKind op) {
+  switch (op) {
+  case arith::AtomicRMWKind::addf:
+    return isa<FloatType>(resultType);
+  case arith::AtomicRMWKind::addi:
+    return isa<IntegerType>(resultType);
+  case arith::AtomicRMWKind::assign:
+    return true;
+  case arith::AtomicRMWKind::mulf:
+    return isa<FloatType>(resultType);
+  case arith::AtomicRMWKind::muli:
+    return isa<IntegerType>(resultType);
+  case arith::AtomicRMWKind::maximumf:
+    return isa<FloatType>(resultType);
+  case arith::AtomicRMWKind::minimumf:
+    return isa<FloatType>(resultType);
+  case arith::AtomicRMWKind::maxs: {
+    auto intType = llvm::dyn_cast<IntegerType>(resultType);
+    return intType && intType.isSigned();
+  }
+  case arith::AtomicRMWKind::mins: {
+    auto intType = llvm::dyn_cast<IntegerType>(resultType);
+    return intType && intType.isSigned();
+  }
+  case arith::AtomicRMWKind::maxu: {
+    auto intType = llvm::dyn_cast<IntegerType>(resultType);
+    return intType && intType.isUnsigned();
+  }
+  case arith::AtomicRMWKind::minu: {
+    auto intType = llvm::dyn_cast<IntegerType>(resultType);
+    return intType && intType.isUnsigned();
+  }
+  case arith::AtomicRMWKind::ori:
+    return isa<IntegerType>(resultType);
+  case arith::AtomicRMWKind::andi:
+    return isa<IntegerType>(resultType);
+  default:
+    return false;
+  }
+}
+
 LogicalResult AffineParallelOp::verify() {
   auto numDims = getNumDims();
   if (getLowerBoundsGroups().getNumElements() != numDims ||
@@ -3946,11 +3989,16 @@ LogicalResult AffineParallelOp::verify() {
   if (getReductions().size() != getNumResults())
     return emitOpError("a reduction must be specified for each output");
 
-  // Verify reduction  ops are all valid
-  for (Attribute attr : getReductions()) {
+  // Verify reduction ops are all valid and each result type matches reduction
+  // ops
+  for (auto it : llvm::enumerate((getReductions()))) {
+    Attribute attr = it.value();
     auto intAttr = llvm::dyn_cast<IntegerAttr>(attr);
     if (!intAttr || !arith::symbolizeAtomicRMWKind(intAttr.getInt()))
       return emitOpError("invalid reduction attribute");
+    auto kind = arith::symbolizeAtomicRMWKind(intAttr.getInt()).value();
+    if (!isResultTypeMatchAtomicRMWKind(getResult(it.index()).getType(), kind))
+      return emitOpError("result type cannot match reduction attribute");
   }
 
   // Verify that the bound operands are valid dimension/symbols.
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index f07ed626a91638..6b7a157925fae1 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -1732,6 +1732,11 @@ void Vectorize::runOnOperation() {
     return signalPassFailure();
   }
 
+  if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
+    f.emitError("Vectorization factor must be greater than zero.");
+    return signalPassFailure();
+  }
+
   DenseSet<Operation *> parallelLoops;
   ReductionLoopMap reductionLoops;
 
diff --git a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp
index f2a0f45acad6f9..9a066756f429ca 100644
--- a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp
@@ -17,7 +17,7 @@
 
 namespace mlir {
 namespace arith {
-#define GEN_PASS_DEF_ARITHBUFFERIZE
+#define GEN_PASS_DEF_ARITHBUFFERIZEPASS
 #include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
 } // namespace arith
 } // namespace mlir
@@ -28,7 +28,9 @@ using namespace bufferization;
 namespace {
 /// Pass to bufferize Arith ops.
 struct ArithBufferizePass
-    : public arith::impl::ArithBufferizeBase<ArithBufferizePass> {
+    : public arith::impl::ArithBufferizePassBase<ArithBufferizePass> {
+  using ArithBufferizePassBase::ArithBufferizePassBase;
+
   ArithBufferizePass(uint64_t alignment = 0, bool constantOpOnly = false)
       : constantOpOnly(constantOpOnly) {
     this->alignment = alignment;
@@ -58,10 +60,6 @@ struct ArithBufferizePass
 };
 } // namespace
 
-std::unique_ptr<Pass> mlir::arith::createArithBufferizePass() {
-  return std::make_unique<ArithBufferizePass>();
-}
-
 std::unique_ptr<Pass>
 mlir::arith::createConstantBufferizePass(uint64_t alignment) {
   return std::make_unique<ArithBufferizePass>(alignment,
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index b8630da2ea8330..42a63316b31c6b 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -16,7 +16,7 @@
 
 namespace mlir {
 namespace arith {
-#define GEN_PASS_DEF_ARITHEXPANDOPS
+#define GEN_PASS_DEF_ARITHEXPANDOPSPASS
 #include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
 } // namespace arith
 } // namespace mlir
@@ -303,11 +303,8 @@ struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
 };
 
 struct ArithExpandOpsPass
-    : public arith::impl::ArithExpandOpsBase<ArithExpandOpsPass> {
-  ArithExpandOpsPass() = default;
-  ArithExpandOpsPass(const arith::ArithExpandOpsOptions& options) {
-    this->includeBf16 = options.includeBf16;
-  }
+    : public arith::impl::ArithExpandOpsPassBase<ArithExpandOpsPass> {
+  using ArithExpandOpsPassBase::ArithExpandOpsPassBase;
 
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
@@ -372,12 +369,3 @@ void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
    >(patterns.getContext());
   // clang-format on
 }
-
-std::unique_ptr<Pass> mlir::arith::createArithExpandOpsPass() {
-  return std::make_unique<ArithExpandOpsPass>();
-}
-
-std::unique_ptr<Pass> mlir::arith::createArithExpandOpsPass(
-  const ArithExpandOpsOptions& options) {
-  return std::make_unique<ArithExpandOpsPass>(options);
-}
diff --git a/mlir/lib/Dialect/ArmSME/IR/ArmSME.cpp b/mlir/lib/Dialect/ArmSME/IR/ArmSME.cpp
index 101cb750f4a6f3..9df15420b9c9b6 100644
--- a/mlir/lib/Dialect/ArmSME/IR/ArmSME.cpp
+++ b/mlir/lib/Dialect/ArmSME/IR/ArmSME.cpp
@@ -29,7 +29,10 @@ using namespace mlir::arm_sme;
 #include "mlir/Dialect/ArmSME/IR/ArmSMEEnums.cpp.inc"
 
 #define GET_OP_CLASSES
-#include "mlir/Dialect/ArmSME/IR/ArmSME.cpp.inc"
+#include "mlir/Dialect/ArmSME/IR/ArmSMEOps.cpp.inc"
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.cpp.inc"
 
 #define GET_TYPEDEF_CLASSES
 #include "mlir/Dialect/ArmSME/IR/ArmSMETypes.cpp.inc"
@@ -45,7 +48,10 @@ void ArmSMEDialect::initialize() {
 
   addOperations<
 #define GET_OP_LIST
-#include "mlir/Dialect/ArmSME/IR/ArmSME.cpp.inc"
+#include "mlir/Dialect/ArmSME/IR/ArmSMEOps.cpp.inc"
+      ,
+#define GET_OP_LIST
+#include "mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.cpp.inc"
       >();
 }
 
diff --git a/mlir/lib/Dialect/ArmSME/IR/CMakeLists.txt b/mlir/lib/Dialect/ArmSME/IR/CMakeLists.txt
index 50cfd9bf2f2781..3e448ec4fb1e04 100644
--- a/mlir/lib/Dialect/ArmSME/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSME/IR/CMakeLists.txt
@@ -5,8 +5,8 @@ add_mlir_dialect_library(MLIRArmSMEDialect
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/ArmSME
 
   DEPENDS
-  MLIRArmSMEIncGen
-  MLIRArmSMEAttrDefsIncGen
+  MLIRArmSMEOpsIncGen
+  MLIRArmSMEIntrinsicOpsIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/ArmSME/Transforms/LegalizeForLLVMExport.cpp
index e75e958e18a2cf..5e13707ea0aa2b 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/LegalizeForLLVMExport.cpp
@@ -550,6 +550,118 @@ struct VectorOuterProductToArmSMELowering
   }
 };
 
+/// Lower `vector.extract` using `arm_sme.move_tile_slice_to_vector`.
+///
+/// Example:
+/// ```
+/// %el = vector.extract %tile[%row, %col]: i32 from vector<[4]x[4]xi32>
+/// ```
+/// Becomes:
+/// ```
+/// %slice = arm_sme.move_tile_slice_to_vector %tile[%row]
+///            : vector<[4]xi32> from vector<[4]x[4]xi32>
+/// %el = vector.extract %slice[%col] : i32 from vector<[4]xi32>
+/// ```
+struct VectorExtractToArmSMELowering
+    : public ConvertOpToLLVMPattern<vector::ExtractOp> {
+  using ConvertOpToLLVMPattern<vector::ExtractOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractOp extractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    VectorType sourceType = extractOp.getSourceVectorType();
+    if (!isValidSMETileVectorType(sourceType))
+      return failure();
+
+    auto loc = extractOp.getLoc();
+    auto position = extractOp.getMixedPosition();
+
+    Value sourceVector = extractOp.getVector();
+
+    // Extract entire vector. Should be handled by folder, but just to be safe.
+    if (position.empty()) {
+      rewriter.replaceOp(extractOp, sourceVector);
+      return success();
+    }
+
+    Value sliceIndex = vector::getAsValues(rewriter, loc, position[0]).front();
+    auto moveTileSliceToVector =
+        rewriter.create<arm_sme::MoveTileSliceToVectorOp>(loc, sourceVector,
+                                                          sliceIndex);
+
+    if (position.size() == 1) {
+      // Single index case: Extracts a 1D slice.
+      rewriter.replaceOp(extractOp, moveTileSliceToVector);
+      return success();
+    }
+
+    // Two indices case: Extracts a single element.
+    assert(position.size() == 2);
+    rewriter.replaceOpWithNewOp<vector::ExtractOp>(
+        extractOp, moveTileSliceToVector, position[1]);
+
+    return success();
+  }
+};
+
+/// Lower `vector.insert` using `arm_sme.move_vector_to_tile_slice` and
+/// `arm_sme.move_tile_slice_to_vector`.
+///
+/// Example:
+/// ```
+/// %new_tile = vector.insert %el, %tile[%row, %col]
+///                     : i32 into vector<[4]x[4]xi32>
+/// ```
+/// Becomes:
+/// ```
+/// %slice = arm_sme.move_tile_slice_to_vector %tile[%row]
+///            : vector<[4]xi32> from vector<[4]x[4]xi32>
+/// %new_slice = vector.insert %el, %slice[%col] : i32 into vector<[4]xi32>
+/// %new_tile = arm_sme.move_vector_to_tile_slice %new_slice, %tile, %row
+///               : vector<[4]xi32> into vector<[4]x[4]xi32>
+/// ```
+struct VectorInsertToArmSMELowering
+    : public ConvertOpToLLVMPattern<vector::InsertOp> {
+  using ConvertOpToLLVMPattern<vector::InsertOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::InsertOp insertOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    VectorType resultType = insertOp.getResult().getType();
+
+    if (!isValidSMETileVectorType(resultType))
+      return failure();
+
+    auto loc = insertOp.getLoc();
+    auto position = insertOp.getMixedPosition();
+
+    Value source = adaptor.getSource();
+
+    // Overwrite entire vector with value. Should be handled by folder, but
+    // just to be safe.
+    if (position.empty()) {
+      rewriter.replaceOp(insertOp, source);
+      return success();
+    }
+
+    Value tileSlice = source;
+    Value sliceIndex = vector::getAsValues(rewriter, loc, position[0]).front();
+    if (position.size() == 2) {
+      // Two indices case: Insert single element into tile.
+      // We need to first extract the existing slice and update the element.
+      tileSlice = rewriter.create<arm_sme::MoveTileSliceToVectorOp>(
+          loc, adaptor.getDest(), sliceIndex);
+      tileSlice = rewriter.create<vector::InsertOp>(loc, source, tileSlice,
+                                                    position[1]);
+    }
+
+    // Insert the slice into the destination tile.
+    rewriter.replaceOpWithNewOp<arm_sme::MoveVectorToTileSliceOp>(
+        insertOp, tileSlice, adaptor.getDest(), sliceIndex);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::configureArmSMELegalizeForExportTarget(
@@ -604,5 +716,6 @@ void mlir::populateArmSMELegalizeForLLVMExportPatterns(
   patterns.add<
       LoadTileSliceToArmSMELowering, MoveTileSliceToVectorArmSMELowering,
       MoveVectorToTileSliceToArmSMELowering, StoreTileSliceToArmSMELowering,
-      VectorOuterProductToArmSMELowering, ZeroOpConversion>(converter);
+      VectorOuterProductToArmSMELowering, ZeroOpConversion,
+      VectorExtractToArmSMELowering, VectorInsertToArmSMELowering>(converter);
 }
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index 7c6c1be351cced..01cbacc96fd42d 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -537,18 +537,18 @@ LogicalResult DeallocTensorOp::bufferize(RewriterBase &rewriter,
 
 bool MaterializeInDestinationOp::bufferizesToMemoryRead(
     OpOperand &opOperand, const AnalysisState &state) {
-  return &opOperand == &getSourceMutable()[0];
+  return &opOperand == &getSourceMutable();
 }
 
 bool MaterializeInDestinationOp::bufferizesToMemoryWrite(
     OpOperand &opOperand, const AnalysisState &state) {
-  return &opOperand == &getDestMutable()[0];
+  return &opOperand == &getDestMutable();
 }
 
 AliasingValueList
 MaterializeInDestinationOp::getAliasingValues(OpOperand &opOperand,
                                               const AnalysisState &state) {
-  if (&opOperand == &getDestMutable()[0])
+  if (&opOperand == &getDestMutable())
     return {{getOperation()->getResult(0), BufferRelation::Equivalent}};
   return {};
 }
@@ -764,6 +764,9 @@ LogicalResult DeallocOp::verify() {
   if (getMemrefs().size() != getConditions().size())
     return emitOpError(
         "must have the same number of conditions as memrefs to deallocate");
+  if (getRetained().size() != getUpdatedConditions().size())
+    return emitOpError("must have the same number of updated conditions "
+                       "(results) as retained operands");
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/TransformOps/BufferizationTransformOps.cpp b/mlir/lib/Dialect/Bufferization/TransformOps/BufferizationTransformOps.cpp
index 354ed162a15ea6..b7db4917a4138e 100644
--- a/mlir/lib/Dialect/Bufferization/TransformOps/BufferizationTransformOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/TransformOps/BufferizationTransformOps.cpp
@@ -47,6 +47,10 @@ void transform::BufferLoopHoistingOp::getEffects(
 LogicalResult transform::OneShotBufferizeOp::verify() {
   if (getMemcpyOp() != "memref.copy" && getMemcpyOp() != "linalg.copy")
     return emitOpError() << "unsupported memcpy op";
+  if (getPrintConflicts() && !getTestAnalysisOnly())
+    return emitOpError() << "'print_conflicts' requires 'test_analysis_only'";
+  if (getDumpAliasSets() && !getTestAnalysisOnly())
+    return emitOpError() << "'dump_alias_sets' requires 'test_analysis_only'";
   return success();
 }
 
@@ -58,6 +62,7 @@ transform::OneShotBufferizeOp::apply(transform::TransformRewriter &rewriter,
   options.allowReturnAllocsFromLoops = getAllowReturnAllocsFromLoops();
   options.allowUnknownOps = getAllowUnknownOps();
   options.bufferizeFunctionBoundaries = getBufferizeFunctionBoundaries();
+  options.dumpAliasSets = getDumpAliasSets();
   options.testAnalysisOnly = getTestAnalysisOnly();
   options.printConflicts = getPrintConflicts();
   if (getFunctionBoundaryTypeConversion().has_value())
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
index 5a6de372e2310d..7bbdeab3ea1a87 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
@@ -49,19 +49,49 @@ static LogicalResult updateDeallocIfChanged(DeallocOp deallocOp,
   return success();
 }
 
-/// Checks if 'memref' may or must alias a MemRef in 'memrefList'. It is often a
+/// Given a memref value, return the "base" value by skipping over all
+/// ViewLikeOpInterface ops (if any) in the reverse use-def chain.
+static Value getViewBase(Value value) {
+  while (auto viewLikeOp = value.getDefiningOp<ViewLikeOpInterface>())
+    value = viewLikeOp.getViewSource();
+  return value;
+}
+
+/// Return "true" if the given values are guaranteed to be different (and
+/// non-aliasing) allocations based on the fact that one value is the result
+/// of an allocation and the other value is a block argument of a parent block.
+/// Note: This is a best-effort analysis that will eventually be replaced by a
+/// proper "is same allocation" analysis. This function may return "false" even
+/// though the two values are distinct allocations.
+static bool distinctAllocAndBlockArgument(Value v1, Value v2) {
+  Value v1Base = getViewBase(v1);
+  Value v2Base = getViewBase(v2);
+  auto areDistinct = [](Value v1, Value v2) {
+    if (Operation *op = v1.getDefiningOp())
+      if (hasEffect<MemoryEffects::Allocate>(op, v1))
+        if (auto bbArg = dyn_cast<BlockArgument>(v2))
+          if (bbArg.getOwner()->findAncestorOpInBlock(*op))
+            return true;
+    return false;
+  };
+  return areDistinct(v1Base, v2Base) || areDistinct(v2Base, v1Base);
+}
+
+/// Checks if `memref` may or must alias a MemRef in `otherList`. It is often a
 /// requirement of optimization patterns that there cannot be any aliasing
-/// memref in order to perform the desired simplification. The 'allowSelfAlias'
-/// argument indicates whether 'memref' may be present in 'memrefList' which
+/// memref in order to perform the desired simplification. The `allowSelfAlias`
+/// argument indicates whether `memref` may be present in `otherList` which
 /// makes this helper function applicable to situations where we already know
-/// that 'memref' is in the list but also when we don't want it in the list.
+/// that `memref` is in the list but also when we don't want it in the list.
 static bool potentiallyAliasesMemref(AliasAnalysis &analysis,
-                                     ValueRange memrefList, Value memref,
+                                     ValueRange otherList, Value memref,
                                      bool allowSelfAlias) {
-  for (auto mr : memrefList) {
-    if (allowSelfAlias && mr == memref)
+  for (auto other : otherList) {
+    if (allowSelfAlias && other == memref)
+      continue;
+    if (distinctAllocAndBlockArgument(other, memref))
       continue;
-    if (!analysis.alias(mr, memref).isNo())
+    if (!analysis.alias(other, memref).isNo())
       return true;
   }
   return false;
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
index 77ad13dacaa983..4c5789306ad758 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
@@ -149,6 +149,8 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
           op.buildSubsetExtraction(rewriter, emptyTensorOp->getLoc());
       if (!replacement)
         continue;
+      if (emptyTensorOp == replacement.getDefiningOp())
+        continue;
       if (replacement.getType() != v.getType()) {
         rewriter.setInsertionPointAfterValue(replacement);
         replacement = rewriter.create<tensor::CastOp>(v.getLoc(), v.getType(),
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
index ad7da317f5db14..6db60b75b302b8 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
@@ -26,82 +26,9 @@ namespace bufferization {
 using namespace mlir;
 using namespace mlir::bufferization;
 
-/// Resolve all operands that are also used inside of repetitive regions of the
-/// same op. Such cases are not fully supported by One-Shot Bufferize.
-///
-/// E.g.:
-/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
-///   "some_use"(%tensor)
-///   ...
-/// }
-///
-/// Is converted to:
-/// %tensor_copy = bufferization.alloc_tensor copy(%tensor)
-/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
-///   "some_use"(%tensor_copy)
-///   ...
-/// }
-static void
-resolveUsesInRepetitiveRegions(Operation *op,
-                               const BufferizationOptions &options) {
-  IRRewriter rewriter(op->getContext());
-  AnalysisState state(options);
-
-  // Look for repetitive ops (loops).
-  op->walk([&](BufferizableOpInterface bufferizableOp) {
-    // Skip filtered ops.
-    if (!options.isOpAllowed(bufferizableOp.getOperation()))
-      return WalkResult::advance();
-
-    // Find all operands that are also used inside of a repetitive region of
-    // this op.
-    for (OpOperand &opOperand : bufferizableOp->getOpOperands()) {
-      Value operand = opOperand.get();
-      // Skip non-tensor operands.
-      if (!isa<TensorType>(operand.getType()))
-        continue;
-      // Skip operands that do not bufferize to memory writes.
-      if (!bufferizableOp.bufferizesToMemoryWrite(opOperand, state))
-        continue;
-
-      // Gather all uses inside repetitive regions.
-      SmallVector<OpOperand *> usesInsideRegion;
-      for (OpOperand &use : operand.getUses()) {
-        Operation *owner = use.getOwner();
-        if (!bufferizableOp->isProperAncestor(owner))
-          continue;
-        for (Region &r : bufferizableOp->getRegions()) {
-          if (r.findAncestorOpInRegion(*owner) &&
-              bufferizableOp.isRepetitiveRegion(r.getRegionNumber())) {
-            usesInsideRegion.push_back(&use);
-            break;
-          }
-        }
-      }
-      // Nothing to do if the operand is not used inside a repetitive region.
-      if (usesInsideRegion.empty())
-        continue;
-
-      // Insert a tensor copy and replace all uses inside of repetitive regions.
-      rewriter.setInsertionPoint(bufferizableOp);
-      auto tensorCopy = rewriter.create<AllocTensorOp>(
-          bufferizableOp->getLoc(), cast<TensorType>(operand.getType()),
-          /*dynamicSizes=*/ValueRange(),
-          /*copy=*/operand, /*memory_space=*/IntegerAttr());
-      for (OpOperand *use : usesInsideRegion)
-        use->set(tensorCopy);
-    }
-
-    return WalkResult::advance();
-  });
-}
-
 LogicalResult mlir::bufferization::insertTensorCopies(
     Operation *op, const OneShotBufferizationOptions &options,
     BufferizationStatistics *statistics) {
-  // Preprocessing: Resolve currently unsupported bufferization cases.
-  resolveUsesInRepetitiveRegions(op, options);
-
   OneShotAnalysisState state(op, options);
   // Run normal One-Shot Bufferize analysis or One-Shot Module Bufferize
   // analysis depending on whether function boundary bufferization is enabled or
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 491f4a66574616..5457d51db1cc18 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -545,16 +545,17 @@ class RegionBuilderHelper {
 
 namespace {
 
-struct EraseSelfCopyOnBuffers : OpRewritePattern<CopyOp> {
+struct EraseSelfCopy : OpRewritePattern<CopyOp> {
   using OpRewritePattern<CopyOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
-    if (!copyOp.hasBufferSemantics())
-      return rewriter.notifyMatchFailure(copyOp,
-                                         "does not have buffer semantics");
-    if (copyOp.getInputs().front() != copyOp.getOutputs().front())
+    if (copyOp.getInputs() != copyOp.getOutputs())
       return rewriter.notifyMatchFailure(copyOp, "not a self copy");
-    rewriter.eraseOp(copyOp);
+    if (copyOp.hasBufferSemantics())
+      rewriter.eraseOp(copyOp);
+    else
+      rewriter.replaceOp(copyOp, copyOp.getInputs());
+
     return success();
   }
 };
@@ -563,7 +564,7 @@ struct EraseSelfCopyOnBuffers : OpRewritePattern<CopyOp> {
 
 void CopyOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                          MLIRContext *context) {
-  results.add<EraseSelfCopyOnBuffers>(context);
+  results.add<EraseSelfCopy>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 17346607fa9cd7..069c613cc246d6 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -949,7 +949,7 @@ struct FoldReshapeWithGenericOpByExpansion
           reshapeOp, "failed preconditions of fusion with producer generic op");
     }
 
-    if (!controlFoldingReshapes(&reshapeOp.getSrcMutable()[0])) {
+    if (!controlFoldingReshapes(&reshapeOp.getSrcMutable())) {
       return rewriter.notifyMatchFailure(reshapeOp,
                                          "fusion blocked by control function");
     }
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 2a524ceb9db887..9f58e9055acadb 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MathExtras.h"
@@ -209,6 +210,76 @@ struct ConvertMemRefLoad final : OpConversionPattern<memref::LoadOp> {
     return success();
   }
 };
+
+//===----------------------------------------------------------------------===//
+// ConvertMemRefSubview
+//===----------------------------------------------------------------------===//
+
+/// Emulating narrow ints on subview have limited support, supporting only
+/// static offset and size and stride of 1. Ideally, the subview should be
+/// folded away before running narrow type emulation, and this pattern would
+/// never run. This pattern is mostly used for testing pruposes.
+struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(memref::SubViewOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    MemRefType newTy =
+        dyn_cast<MemRefType>(getTypeConverter()->convertType(op.getType()));
+    if (!newTy) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(),
+          llvm::formatv("failed to convert memref type: {0}", op.getType()));
+    }
+
+    auto convertedElementType = newTy.getElementType();
+    auto oldElementType = op.getType().getElementType();
+    int srcBits = oldElementType.getIntOrFloatBitWidth();
+    int dstBits = convertedElementType.getIntOrFloatBitWidth();
+    if (dstBits % srcBits != 0) {
+      return rewriter.notifyMatchFailure(
+          op, "only dstBits % srcBits == 0 supported");
+    }
+
+    // Only support offset for 1-D subview.
+    if (op.getType().getRank() != 1) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(), "subview with rank > 1 is not supported");
+    }
+
+    // Only support stride of 1.
+    if (op.getStaticStride(0) != 1) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(), "subview with stride != 1 is not supported");
+    }
+
+    int64_t size = op.getStaticSize(0);
+    int64_t offset = op.getStaticOffset(0);
+    // Only support static sizes and offsets.
+    if (size == ShapedType::kDynamic || offset == ShapedType::kDynamic) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(), "subview with dynamic size or offset is not supported");
+    }
+
+    int elementsPerByte = dstBits / srcBits;
+    if (offset % elementsPerByte != 0) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(),
+          "subview with offset not multiple of elementsPerByte is not "
+          "supported");
+    }
+
+    size = ceilDiv(size, elementsPerByte);
+    offset = offset / elementsPerByte;
+
+    rewriter.replaceOpWithNewOp<memref::SubViewOp>(
+        op, newTy, *adaptor.getODSOperands(0).begin(), offset, size,
+        op.getStaticStrides());
+    return success();
+  }
+};
+
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
@@ -220,9 +291,9 @@ void memref::populateMemRefNarrowTypeEmulationPatterns(
     RewritePatternSet &patterns) {
 
   // Populate `memref.*` conversion patterns.
-  patterns
-      .add<ConvertMemRefAlloc, ConvertMemRefLoad, ConvertMemRefAssumeAlignment>(
-          typeConverter, patterns.getContext());
+  patterns.add<ConvertMemRefAlloc, ConvertMemRefLoad,
+               ConvertMemRefAssumeAlignment, ConvertMemRefSubview>(
+      typeConverter, patterns.getContext());
   memref::populateResolveExtractStridedMetadataPatterns(patterns);
 }
 
@@ -271,9 +342,22 @@ void memref::populateMemRefNarrowTypeEmulationConversions(
           return std::nullopt;
 
         StridedLayoutAttr layoutAttr;
+        // If the offset is 0, we do not need a strided layout as the stride is
+        // 1, so we only use the strided layout if the offset is not 0.
         if (offset != 0) {
-          layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
-                                              ArrayRef<int64_t>{1});
+          if (offset == ShapedType::kDynamic) {
+            layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
+                                                ArrayRef<int64_t>{1});
+          } else {
+            // Check if the number of bytes are a multiple of the loadStoreWidth
+            // and if so, divide it by the loadStoreWidth to get the offset.
+            if ((offset * width) % loadStoreWidth != 0)
+              return std::nullopt;
+            offset = (offset * width) / loadStoreWidth;
+
+            layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
+                                                ArrayRef<int64_t>{1});
+          }
         }
 
         return MemRefType::get(getLinearizedShape(ty, width, loadStoreWidth),
diff --git a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
index 672ef3eb4cd50f..101e099d2b644c 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
@@ -870,6 +870,92 @@ class ExtractStridedMetadataOpReinterpretCastFolder
   }
 };
 
+/// Replace `base, offset, sizes, strides =
+///              extract_strided_metadata(
+///                 cast(src) to dstTy)`
+/// With
+/// ```
+/// base, ... = extract_strided_metadata(src)
+/// offset = !dstTy.srcOffset.isDynamic()
+///            ? dstTy.srcOffset
+///            : extract_strided_metadata(src).offset
+/// sizes = for each srcSize in dstTy.srcSizes:
+///           !srcSize.isDynamic()
+///             ? srcSize
+//              : extract_strided_metadata(src).sizes[i]
+/// strides = for each srcStride in dstTy.srcStrides:
+///             !srcStrides.isDynamic()
+///               ? srcStrides
+///               : extract_strided_metadata(src).strides[i]
+/// ```
+///
+/// In other words, consume the `cast` and apply its effects
+/// on the offset, sizes, and strides or compute them directly from `src`.
+class ExtractStridedMetadataOpCastFolder
+    : public OpRewritePattern<memref::ExtractStridedMetadataOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult
+  matchAndRewrite(memref::ExtractStridedMetadataOp extractStridedMetadataOp,
+                  PatternRewriter &rewriter) const override {
+    Value source = extractStridedMetadataOp.getSource();
+    auto castOp = source.getDefiningOp<memref::CastOp>();
+    if (!castOp)
+      return failure();
+
+    Location loc = extractStridedMetadataOp.getLoc();
+    // Check if the source is suitable for extract_strided_metadata.
+    SmallVector<Type> inferredReturnTypes;
+    if (failed(extractStridedMetadataOp.inferReturnTypes(
+            rewriter.getContext(), loc, {castOp.getSource()},
+            /*attributes=*/{}, /*properties=*/nullptr, /*regions=*/{},
+            inferredReturnTypes)))
+      return rewriter.notifyMatchFailure(castOp,
+                                         "cast source's type is incompatible");
+
+    auto memrefType = cast<MemRefType>(source.getType());
+    unsigned rank = memrefType.getRank();
+    SmallVector<OpFoldResult> results;
+    results.resize_for_overwrite(rank * 2 + 2);
+
+    auto newExtractStridedMetadata =
+        rewriter.create<memref::ExtractStridedMetadataOp>(loc,
+                                                          castOp.getSource());
+
+    // Register the base_buffer.
+    results[0] = newExtractStridedMetadata.getBaseBuffer();
+
+    auto getConstantOrValue = [&rewriter](int64_t constant,
+                                          OpFoldResult ofr) -> OpFoldResult {
+      return !ShapedType::isDynamic(constant)
+                 ? OpFoldResult(rewriter.getIndexAttr(constant))
+                 : ofr;
+    };
+
+    auto [sourceStrides, sourceOffset] = getStridesAndOffset(memrefType);
+    assert(sourceStrides.size() == rank && "unexpected number of strides");
+
+    // Register the new offset.
+    results[1] =
+        getConstantOrValue(sourceOffset, newExtractStridedMetadata.getOffset());
+
+    const unsigned sizeStartIdx = 2;
+    const unsigned strideStartIdx = sizeStartIdx + rank;
+    ArrayRef<int64_t> sourceSizes = memrefType.getShape();
+
+    SmallVector<OpFoldResult> sizes = newExtractStridedMetadata.getSizes();
+    SmallVector<OpFoldResult> strides = newExtractStridedMetadata.getStrides();
+    for (unsigned i = 0; i < rank; ++i) {
+      results[sizeStartIdx + i] = getConstantOrValue(sourceSizes[i], sizes[i]);
+      results[strideStartIdx + i] =
+          getConstantOrValue(sourceStrides[i], strides[i]);
+    }
+    rewriter.replaceOp(extractStridedMetadataOp,
+                       getValueOrCreateConstantIndexOp(rewriter, loc, results));
+    return success();
+  }
+};
+
 /// Replace `base, offset =
 ///            extract_strided_metadata(extract_strided_metadata(src)#0)`
 /// With
@@ -911,6 +997,7 @@ void memref::populateExpandStridedMetadataPatterns(
                ExtractStridedMetadataOpGetGlobalFolder,
                RewriteExtractAlignedPointerAsIndexOfViewLikeOp,
                ExtractStridedMetadataOpReinterpretCastFolder,
+               ExtractStridedMetadataOpCastFolder,
                ExtractStridedMetadataOpExtractStridedMetadataFolder>(
       patterns.getContext());
 }
@@ -923,6 +1010,7 @@ void memref::populateResolveExtractStridedMetadataPatterns(
                ExtractStridedMetadataOpSubviewFolder,
                RewriteExtractAlignedPointerAsIndexOfViewLikeOp,
                ExtractStridedMetadataOpReinterpretCastFolder,
+               ExtractStridedMetadataOpCastFolder,
                ExtractStridedMetadataOpExtractStridedMetadataFolder>(
       patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index dfec1798680041..e8ecd0faa4c86d 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -367,10 +368,10 @@ LogicalResult TmaCreateDescriptorOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
-// NVGPU_GenerateGmmaDescriptorOp
+// NVGPU_WarpgroupGenerateDescriptorOp
 //===----------------------------------------------------------------------===//
 
-LogicalResult GenerateGmmaDescriptorOp::verify() {
+LogicalResult WarpgroupGenerateDescriptorOp::verify() {
   MemRefType memrefType = getTensor().getType();
   MemRefType tensorMapType = getTensorMap().getType().getTensor();
 
@@ -529,6 +530,39 @@ LogicalResult WarpgroupMmaOp::verify() {
   return success();
 }
 
+LogicalResult WarpgroupMmaStoreOp::verify() {
+  MemRefType dstMemrefType = getDstMemref().getType();
+  VectorType firstVtype = getMatrixD()
+                              .front()
+                              .getType()
+                              .cast<WarpgroupAccumulatorType>()
+                              .getFragmented();
+
+  int64_t totalFirstDimension = 0;
+  for (Value result : getMatrixD()) {
+    VectorType vtype =
+        result.getType().cast<WarpgroupAccumulatorType>().getFragmented();
+    if (vtype != firstVtype)
+      return emitOpError() << "all fragmented types must be the same";
+    // Limitation
+    if (!vtype.getElementType().isF32()) {
+      return emitOpError()
+             << "hit a limitation: only f32 results for the time being";
+    }
+    totalFirstDimension += vtype.getDimSize(0);
+  }
+  if (totalFirstDimension != dstMemrefType.getDimSize(0) ||
+      firstVtype.getDimSize(1) != dstMemrefType.getDimSize(1)) {
+    return emitOpError() << "results [" << totalFirstDimension << "]["
+                         << firstVtype.getDimSize(1)
+                         << "] values. However, destination memref["
+                         << dstMemrefType.getDimSize(0) << "]["
+                         << dstMemrefType.getDimSize(1)
+                         << "]  does not have same size as results";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd dialect, type, and op definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 88c025c6b2b2e8..0d02a590f29693 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -45,6 +45,28 @@ static Value castBuffer(OpBuilder &b, Value buffer, Type type) {
   return b.create<memref::CastOp>(buffer.getLoc(), type, buffer).getResult();
 }
 
+/// Helper function for loop bufferization. Return "true" if the given value
+/// is guaranteed to not alias with an external tensor apart from values in
+/// `exceptions`. A value is external if it is defined outside of the given
+/// region or if it is an entry block argument of the region.
+static bool doesNotAliasExternalValue(Value value, Region *region,
+                                      ValueRange exceptions,
+                                      const OneShotAnalysisState &state) {
+  assert(region->getBlocks().size() == 1 &&
+         "expected region with single block");
+  bool result = true;
+  state.applyOnAliases(value, [&](Value alias) {
+    if (llvm::is_contained(exceptions, alias))
+      return;
+    Region *aliasRegion = alias.getParentRegion();
+    if (isa<BlockArgument>(alias) && !region->isProperAncestor(aliasRegion))
+      result = false;
+    if (isa<OpResult>(alias) && !region->isAncestor(aliasRegion))
+      result = false;
+  });
+  return result;
+}
+
 /// Bufferization of scf.condition.
 struct ConditionOpInterface
     : public BufferizableOpInterface::ExternalModel<ConditionOpInterface,
@@ -633,12 +655,10 @@ struct ForOpInterface
       return success();
 
     // According to the `getAliasing...` implementations, a bufferized OpResult
-    // may alias only with the corresponding bufferized init_arg and with no
-    // other buffers. I.e., the i-th OpResult may alias with the i-th init_arg;
-    // but not with any other OpOperand. If a corresponding OpResult/init_arg
-    // pair bufferizes to equivalent buffers, this aliasing requirement is
-    // satisfied. Otherwise, we cannot be sure and must yield a new buffer copy.
-    // (New buffer copies do not alias with any buffer.)
+    // may alias only with the corresponding bufferized init_arg (or with a
+    // newly allocated buffer) and not with other buffers defined outside of the
+    // loop. I.e., the i-th OpResult may alias with the i-th init_arg;
+    // but not with any other OpOperand.
     auto forOp = cast<scf::ForOp>(op);
     auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
     OpBuilder::InsertionGuard g(rewriter);
@@ -647,20 +667,24 @@ struct ForOpInterface
     // Indices of all iter_args that have tensor type. These are the ones that
     // are bufferized.
     DenseSet<int64_t> indices = getTensorIndices(forOp.getInitArgs());
-    // For every yielded value, is the value equivalent to its corresponding
-    // bbArg?
-    DenseSet<int64_t> equivalentYields = getEquivalentBuffers(
-        forOp.getRegionIterArgs(), yieldOp.getResults(), state);
+    // For every yielded value, does it alias with something defined outside of
+    // the loop?
     SmallVector<Value> yieldValues;
-    for (int64_t idx = 0;
-         idx < static_cast<int64_t>(yieldOp.getResults().size()); ++idx) {
-      Value value = yieldOp.getResults()[idx];
-      if (!indices.contains(idx) || equivalentYields.contains(idx)) {
-        yieldValues.push_back(value);
+    for (const auto it : llvm::enumerate(yieldOp.getResults())) {
+      // Note: `state` is guaranteed to be a `OneShotAnalysisState`, but this
+      // type cannot be used in the signature of `resolveConflicts` because the
+      // op interface is in the "IR" build unit and the `OneShotAnalysisState`
+      // is defined in the "Transforms" build unit.
+      if (!indices.contains(it.index()) ||
+          doesNotAliasExternalValue(
+              it.value(), &forOp.getRegion(),
+              /*exceptions=*/forOp.getRegionIterArg(it.index()),
+              static_cast<const OneShotAnalysisState &>(state))) {
+        yieldValues.push_back(it.value());
         continue;
       }
       FailureOr<Value> alloc = allocateTensorForShapedValue(
-          rewriter, yieldOp.getLoc(), value, state.getOptions());
+          rewriter, yieldOp.getLoc(), it.value(), state.getOptions());
       if (failed(alloc))
         return failure();
       yieldValues.push_back(*alloc);
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 1ecfe7906c571f..96d6169111b385 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -526,7 +526,7 @@ mlir::scf::tileAndFuseProducerOfSlice(RewriterBase &rewriter,
   // 1. Get the producer of the source (potentially walking through
   // `iter_args` of nested `scf.for`)
   auto [fusableProducer, destinationInitArg] =
-      getUntiledProducerFromSliceSource(&candidateSliceOp.getSourceMutable()[0],
+      getUntiledProducerFromSliceSource(&candidateSliceOp.getSourceMutable(),
                                         loops);
   if (!fusableProducer)
     return std::nullopt;
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
index 6938a7ad783ba8..053e067fff64dd 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
@@ -68,7 +68,7 @@ FailureOr<uint8_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
   } else if (base.compare("block2_4") == 0) {
     properties |= static_cast<uint8_t>(LevelFormat::TwoOutOfFour);
   } else if (base.compare("loose_compressed") == 0) {
-    properties |= static_cast<uint8_t>(LevelFormat::CompressedWithHi);
+    properties |= static_cast<uint8_t>(LevelFormat::LooseCompressed);
   } else if (base.compare("singleton") == 0) {
     properties |= static_cast<uint8_t>(LevelFormat::Singleton);
   } else {
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index b962dda20cfe64..96ed5f13b9d9ec 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -422,10 +422,10 @@ const static DimLevelType validDLTs[] = {DimLevelType::Dense,
                                          DimLevelType::SingletonNu,
                                          DimLevelType::SingletonNo,
                                          DimLevelType::SingletonNuNo,
-                                         DimLevelType::CompressedWithHi,
-                                         DimLevelType::CompressedWithHiNu,
-                                         DimLevelType::CompressedWithHiNo,
-                                         DimLevelType::CompressedWithHiNuNo};
+                                         DimLevelType::LooseCompressed,
+                                         DimLevelType::LooseCompressedNu,
+                                         DimLevelType::LooseCompressedNo,
+                                         DimLevelType::LooseCompressedNuNo};
 
 static std::optional<DimLevelType> parseDLT(StringRef str) {
   for (DimLevelType dlt : validDLTs)
@@ -586,31 +586,67 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) {
 }
 
 void SparseTensorEncodingAttr::print(AsmPrinter &printer) const {
-  // Print the struct-like storage in dictionary fashion.
-  printer << "<{ lvlTypes = [ ";
-  llvm::interleaveComma(getLvlTypes(), printer, [&](DimLevelType dlt) {
-    printer << "\"" << toMLIRString(dlt) << "\"";
-  });
-  printer << " ]";
+  auto map = static_cast<AffineMap>(getDimToLvl());
+  // Empty affine map indicates identity map
+  if (!map)
+    map = AffineMap::getMultiDimIdentityMap(getLvlTypes().size(), getContext());
+  printer << "<{ map = ";
+  printSymbols(map, printer);
+  printer << '(';
+  printDimensions(map, printer, getDimSlices());
+  printer << ") -> (";
+  printLevels(map, printer, getLvlTypes());
+  printer << ')';
   // Print remaining members only for non-default values.
-  if (!isIdentity())
-    printer << ", dimToLvl = affine_map<" << getDimToLvl() << ">";
   if (getPosWidth())
     printer << ", posWidth = " << getPosWidth();
   if (getCrdWidth())
     printer << ", crdWidth = " << getCrdWidth();
-  if (!getDimSlices().empty()) {
-    printer << ", dimSlices = [ ";
-    llvm::interleaveComma(getDimSlices(), printer,
-                          [&](SparseTensorDimSliceAttr attr) {
-                            // Calls SparseTensorDimSliceAttr::print directly to
-                            // skip mnemonic.
-                            attr.print(printer);
-                          });
-    printer << " ]";
+  printer << " }>";
+}
+
+void SparseTensorEncodingAttr::printSymbols(AffineMap &map,
+                                            AsmPrinter &printer) const {
+  if (map.getNumSymbols() == 0)
+    return;
+  printer << '[';
+  for (unsigned i = 0, n = map.getNumSymbols() - 1; i < n; i++)
+    printer << 's' << i << ", ";
+  if (map.getNumSymbols() >= 1)
+    printer << 's' << map.getNumSymbols() - 1;
+  printer << ']';
+}
+
+void SparseTensorEncodingAttr::printDimensions(
+    AffineMap &map, AsmPrinter &printer,
+    ArrayRef<SparseTensorDimSliceAttr> dimSlices) const {
+  if (!dimSlices.empty()) {
+    for (unsigned i = 0, n = map.getNumDims() - 1; i < n; i++)
+      printer << 'd' << i << " : " << dimSlices[i] << ", ";
+    if (map.getNumDims() >= 1) {
+      printer << 'd' << map.getNumDims() - 1 << " : "
+              << dimSlices[map.getNumDims() - 1];
+    }
+  } else {
+    for (unsigned i = 0, n = map.getNumDims() - 1; i < n; i++)
+      printer << 'd' << i << ", ";
+    if (map.getNumDims() >= 1)
+      printer << 'd' << map.getNumDims() - 1;
   }
+}
 
-  printer << " }>";
+void SparseTensorEncodingAttr::printLevels(
+    AffineMap &map, AsmPrinter &printer,
+    ArrayRef<DimLevelType> lvlTypes) const {
+  for (unsigned i = 0, n = map.getNumResults() - 1; i < n; i++) {
+    map.getResult(i).print(printer.getStream());
+    printer << " : " << toMLIRString(lvlTypes[i]) << ", ";
+  }
+  if (map.getNumResults() >= 1) {
+    auto lastIndex = map.getNumResults() - 1;
+    map.getResult(lastIndex).print(printer.getStream());
+    printer << " : " << toMLIRString(lvlTypes[lastIndex]);
+  }
 }
 
 LogicalResult
@@ -712,7 +748,7 @@ mlir::sparse_tensor::getSparseTensorEncoding(Type type) {
 bool mlir::sparse_tensor::isCOOType(SparseTensorEncodingAttr enc,
                                     Level startLvl, bool isUnique) {
   if (!enc ||
-      !(enc.isCompressedLvl(startLvl) || enc.isCompressedWithHiLvl(startLvl)))
+      !(enc.isCompressedLvl(startLvl) || enc.isLooseCompressedLvl(startLvl)))
     return false;
   const Level lvlRank = enc.getLvlRank();
   for (Level l = startLvl + 1; l < lvlRank; ++l)
@@ -1363,7 +1399,7 @@ LogicalResult SelectOp::verify() {
   return success();
 }
 
-LogicalResult SortCooOp::verify() {
+LogicalResult SortOp::verify() {
   AffineMap xPerm = getPermMap();
   uint64_t nx = xPerm.getNumDims();
   if (nx < 1)
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
index a6468b3e14795f..8145446751b993 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -308,17 +308,46 @@ Value reshapeValuesToLevels(OpBuilder &builder, Location loc,
                             SparseTensorEncodingAttr enc, ValueRange dimSizes,
                             Value valuesBuffer, Value lvlCoords);
 
+// Generates code to cast a tensor to a memref.
+TypedValue<BaseMemRefType> genToMemref(OpBuilder &builder, Location loc,
+                                       Value tensor);
+
+/// Infers the result type and generates `ToPositionsOp`.
+Value genToPositions(OpBuilder &builder, Location loc, Value tensor, Level lvl);
+
+/// Infers the result type and generates `ToCoordinatesOp`.  If the
+/// level is within a COO region, the result type is a memref with unknown
+/// stride and offset.  Otherwise, the result type is a memref without
+/// any specified layout.
+Value genToCoordinates(OpBuilder &builder, Location loc, Value tensor,
+                       Level lvl, Level cooStart);
+
+/// Infers the result type and generates `ToCoordinatesBufferOp`.
+Value genToCoordinatesBuffer(OpBuilder &builder, Location loc, Value tensor);
+
+/// Infers the result type and generates `ToValuesOp`.
+Value genToValues(OpBuilder &builder, Location loc, Value tensor);
+
+/// Generates code to retrieve the values size for the sparse tensor.
+Value genValMemSize(OpBuilder &builder, Location loc, Value tensor);
+
+/// Generates code to retrieve the slice offset for the sparse tensor slice,
+/// return a constant if the offset is statically known.
+Value createOrFoldSliceOffsetOp(OpBuilder &builder, Location loc, Value tensor,
+                                Dimension dim);
+
+/// Generates code to retrieve the slice slice for the sparse tensor slice,
+/// return a constant if the offset is statically known.
+Value createOrFoldSliceStrideOp(OpBuilder &builder, Location loc, Value tensor,
+                                Dimension dim);
+
 //===----------------------------------------------------------------------===//
 // Inlined constant generators.
 //
 // All these functions are just wrappers to improve code legibility;
 // therefore, we mark them as `inline` to avoid introducing any additional
-// overhead due to the legibility.
+// overhead due to the legibility. Ideally these should move upstream.
 //
-// TODO: Ideally these should move upstream, so that we don't
-// develop a design island.  However, doing so will involve
-// substantial design work.  For related prior discussion, see
-// <https://llvm.discourse.group/t/evolving-builder-apis-based-on-lessons-learned-from-edsc/879>
 //===----------------------------------------------------------------------===//
 
 /// Generates a 0-valued constant of the given type.  In addition to
@@ -420,38 +449,6 @@ inline bool isZeroRankedTensorOrScalar(Type type) {
   return !rtp || rtp.getRank() == 0;
 }
 
-// Generates code to cast a tensor to a memref.
-TypedValue<BaseMemRefType> genToMemref(OpBuilder &builder, Location loc,
-                                       Value tensor);
-
-/// Infers the result type and generates `ToPositionsOp`.
-Value genToPositions(OpBuilder &builder, Location loc, Value tensor, Level lvl);
-
-/// Infers the result type and generates `ToCoordinatesOp`.  If the
-/// level is within a COO region, the result type is a memref with unknown
-/// stride and offset.  Otherwise, the result type is a memref without
-/// any specified layout.
-Value genToCoordinates(OpBuilder &builder, Location loc, Value tensor,
-                       Level lvl, Level cooStart);
-
-/// Infers the result type and generates `ToCoordinatesBufferOp`.
-Value genToCoordinatesBuffer(OpBuilder &builder, Location loc, Value tensor);
-
-/// Infers the result type and generates `ToValuesOp`.
-Value genToValues(OpBuilder &builder, Location loc, Value tensor);
-
-/// Generates code to retrieve the values size for the sparse tensor.
-Value genValMemSize(OpBuilder &builder, Location loc, Value tensor);
-
-/// Generates code to retrieve the slice offset for the sparse tensor slice,
-/// return a constant if the offset is statically known.
-Value createOrFoldSliceOffsetOp(OpBuilder &builder, Location loc, Value tensor,
-                                Dimension dim);
-
-/// Generates code to retrieve the slice slice for the sparse tensor slice,
-/// return a constant if the offset is statically known.
-Value createOrFoldSliceStrideOp(OpBuilder &builder, Location loc, Value tensor,
-                                Dimension dim);
 } // namespace sparse_tensor
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
index 9feaceac2f51bd..2d4f40eceba3b0 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -242,7 +242,7 @@ Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc,
         {
           OpBuilder::InsertionGuard guard(builder);
           // Load the next coordinates only when inbound (to avoid OOB
-          // acccesses).
+          // accesses).
           builder.setInsertionPointToStart(ifInBound.thenBlock());
           Value crd = genIndexLoad(builder, loc, coordinates, pos);
           Value isSameCrd = builder.create<arith::CmpIOp>(
@@ -433,7 +433,7 @@ void LoopEmitter::initializeLoopEmit(
              !highs[t][l]);
       const auto lvlTp = lvlTypes[t][l];
       // Handle sparse storage schemes.
-      if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
+      if (isCompressedDLT(lvlTp) || isLooseCompressedDLT(lvlTp)) {
         // Generate sparse primitives to obtain positions and coordinates.
         positionsBuffers[t][l] = genToPositions(builder, loc, tensor, l);
         coordinatesBuffers[t][l] =
@@ -534,7 +534,7 @@ void LoopEmitter::categorizeLoopCondition(
     auto lvlType = lvlTypes[t][l];
     // Must be a recognizable DLT.
     assert(isDenseDLT(lvlType) || isCompressedDLT(lvlType) ||
-           isCompressedWithHiDLT(lvlType) || isSingletonDLT(lvlType));
+           isLooseCompressedDLT(lvlType) || isSingletonDLT(lvlType));
 
     bool isSparse = !isDenseDLT(lvlType);
     bool isSlice = isSparseSlices[t];
@@ -630,7 +630,7 @@ std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl(
     OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value lo,
     Value hi, MutableArrayRef<Value> reduc, bool isParallel) {
   bool isSparseCond = isCompressedDLT(lvlTypes[tid][lvl]) ||
-                      isCompressedWithHiDLT(lvlTypes[tid][lvl]) ||
+                      isLooseCompressedDLT(lvlTypes[tid][lvl]) ||
                       isSingletonDLT(lvlTypes[tid][lvl]);
   // TODO: support dynamic slices.
   // Uses the first dimension here to build the loop bound (which is also the
@@ -651,7 +651,7 @@ std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl(
     // expression on init vals will be moved into scf.reduce and replaced with
     // the block arguments when exiting the loop (see exitForLoop). This is
     // needed as we can not build the actual reduction block and get the actual
-    // reduction varaible before users fill parallel loop body.
+    // reduction variable before users fill parallel loop body.
     for (int i = 0, e = reduc.size(); i < e; i++)
       reduc[i] = parOp.getInitVals()[i];
     loop = parOp;
@@ -882,7 +882,7 @@ std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
 
   // The set of induction variables for the while loop.
   SmallVector<Value> ivs;
-  // Segement sizes for induction variables used for different kinds of loop
+  // Segment sizes for induction variables used for different kinds of loop
   // conditions.
   SmallVector<unsigned> opSegSize;
 
@@ -893,7 +893,7 @@ std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
     // Dense level are handled by the shared univeral index.
     assert(!isDenseCond(cKind));
     // Must be a recognizable sparse level.
-    assert(isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp) ||
+    assert(isCompressedDLT(lvlTp) || isLooseCompressedDLT(lvlTp) ||
            isSingletonDLT(lvlTp));
     (void)lvlTp;
 
@@ -1012,7 +1012,7 @@ std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
     for (auto [tid, lvl] : unpackTensorLevelFromCondRange(spConds)) {
       const auto lvlTp = lvlTypes[tid][lvl];
       if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
-          isCompressedWithHiDLT(lvlTp)) {
+          isLooseCompressedDLT(lvlTp)) {
         const auto crd = coords[tid][lvl];
         if (min) {
           Value cmp = CMPI(ult, coords[tid][lvl], min);
@@ -1077,7 +1077,7 @@ Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
   needsUniv = !spConds.empty() && needsUniv;
   // The TensorLevel used for loop conditions.
   // If there is any sparse level, we need to use the sparse condition.
-  // If all levels are dense, we can pick arbitary one (dense slice-driven loop
+  // If all levels are dense, we can pick arbitrary one (dense slice-driven loop
   // can be generated using a simple ForOp as well).
   Operation *l = nullptr;
   Value iv = nullptr;
@@ -1237,11 +1237,11 @@ void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
   // Either the first level, or the previous level has been set.
   /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
   assert(lvl == 0 || posits[tid][lvl - 1]);
-  if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
+  if (isCompressedDLT(lvlTp) || isLooseCompressedDLT(lvlTp)) {
     const Value mem = positionsBuffers[tid][lvl];
 
     Value pLo = lvl == 0 ? c0 : posits[tid][lvl - 1];
-    if (isCompressedWithHiDLT(lvlTp))
+    if (isLooseCompressedDLT(lvlTp))
       pLo = builder.create<arith::MulIOp>(loc, pLo, C_IDX(2));
     posits[tid][lvl] = genIndexLoad(builder, loc, mem, pLo);
 
@@ -1538,7 +1538,7 @@ void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
   for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.trivialTidLvls)) {
     const auto lvlTp = lvlTypes[tid][lvl];
     if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
-        isCompressedWithHiDLT(lvlTp)) {
+        isLooseCompressedDLT(lvlTp)) {
       const Value crd = coords[tid][lvl];
       const Value pos = posits[tid][lvl];
       Value cmp = CMPI(eq, crd, iv);
@@ -1700,7 +1700,7 @@ std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
           // Delegates to users' callback.
           bodyBuilder(builder, loc, iv, ifRet);
         }
-        // Marks this speical ifOp to avoid sparisification finalizing it.
+        // Marks this special ifOp to avoid sparisification finalizing it.
         ifOp->setAttr(getLoopEmitterLoopAttrName(),
                       StringAttr::get(builder.getContext(), "slice"));
         // Insertion point restored to after ifOp.
@@ -1741,7 +1741,7 @@ ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
   Value pos = c0;
   OpBuilder::InsertPoint ip;
   SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
-  scf::ForOp outerMost = nullptr; // the outtermost loop.
+  scf::ForOp outerMost = nullptr; // the outermost loop.
 
   // Wraps body builder and inserts a extra counting instruction at the end.
   auto wrapped = [bodyBuilder](OpBuilder &builder, Location loc, Value iv,
@@ -1842,7 +1842,7 @@ ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
                                OpBuilder &builder, Location loc, ValueRange ivs,
                                ValueRange iterArgs) -> scf::ValueVector {
                              for (auto em : llvm::enumerate(ivs)) {
-                               // Linearizes postion: pos = (pos * lvlsize) +
+                               // Linearizes position: pos = (pos * lvlsize) +
                                // iv;
                                pos = MULI(pos, lvlSzs[em.index()]);
                                pos = ADDI(pos, em.value());
@@ -2072,7 +2072,7 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
   assert(isOrderedDLT(lvlType));
   if (isSingletonDLT(lvlType)) {
     llvm_unreachable("TODO: dense level should be easy to support, while "
-                     "singleton level requres more efforts");
+                     "singleton level requires more efforts");
   }
 
   assert(!dependentLvlMap[tid][lvl].empty());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp
index 3181395a474cfe..ced7983af324c1 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp
@@ -1398,11 +1398,11 @@ struct PushBackRewriter : OpRewritePattern<PushBackOp> {
 };
 
 /// Sparse rewriting rule for the sort_coo operator.
-struct SortCooRewriter : public OpRewritePattern<SortCooOp> {
+struct SortRewriter : public OpRewritePattern<SortOp> {
 public:
-  using OpRewritePattern<SortCooOp>::OpRewritePattern;
+  using OpRewritePattern<SortOp>::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(SortCooOp op,
+  LogicalResult matchAndRewrite(SortOp op,
                                 PatternRewriter &rewriter) const override {
     SmallVector<Value> xys;
     xys.push_back(op.getXy());
@@ -1427,5 +1427,5 @@ void mlir::populateSparseBufferRewriting(RewritePatternSet &patterns,
                                          bool enableBufferInitialization) {
   patterns.add<PushBackRewriter>(patterns.getContext(),
                                  enableBufferInitialization);
-  patterns.add<SortCooRewriter>(patterns.getContext());
+  patterns.add<SortRewriter>(patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 3a3ea311c49d98..7c362c086623b4 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -472,8 +472,17 @@ class SparseInsertGenerator
     llvm::raw_svector_ostream nameOstream(nameBuffer);
     nameOstream << kInsertFuncNamePrefix;
     const Level lvlRank = stt.getLvlRank();
-    for (Level l = 0; l < lvlRank; l++)
-      nameOstream << toMLIRString(stt.getLvlType(l)) << "_";
+    for (Level l = 0; l < lvlRank; l++) {
+      std::string lvlType = toMLIRString(stt.getLvlType(l));
+      // Replace/remove punctuations in level properties.
+      std::replace_if(
+          lvlType.begin(), lvlType.end(),
+          [](char c) { return c == '(' || c == ','; }, '_');
+      lvlType.erase(std::remove_if(lvlType.begin(), lvlType.end(),
+                                   [](char c) { return c == ')' || c == ' '; }),
+                    lvlType.end());
+      nameOstream << lvlType << "_";
+    }
     // Static dim sizes are used in the generated code while dynamic sizes are
     // loaded from the dimSizes buffer. This is the reason for adding the shape
     // to the function name.
@@ -498,7 +507,7 @@ static void genEndInsert(OpBuilder &builder, Location loc,
   const Level lvlRank = stt.getLvlRank();
   for (Level l = 0; l < lvlRank; l++) {
     const auto dlt = stt.getLvlType(l);
-    if (isCompressedWithHiDLT(dlt))
+    if (isLooseCompressedDLT(dlt))
       llvm_unreachable("TODO: Not yet implemented");
     if (isCompressedDLT(dlt)) {
       // Compressed dimensions need a position cleanup for all entries
@@ -931,7 +940,7 @@ class SparseCompressConverter : public OpConversionPattern<CompressOp> {
     // If the innermost level is ordered, we need to sort the coordinates
     // in the "added" array prior to applying the compression.
     if (dstType.isOrderedLvl(dstType.getLvlRank() - 1))
-      rewriter.create<SortCooOp>(
+      rewriter.create<SortOp>(
           loc, count, added, ValueRange{}, rewriter.getMultiDimIdentityMap(1),
           rewriter.getIndexAttr(0), SparseTensorSortKind::HybridQuickSort);
     // While performing the insertions, we also need to reset the elements
@@ -1237,7 +1246,7 @@ class SparseNumberOfEntriesConverter
                   ConversionPatternRewriter &rewriter) const override {
     // Query memSizes for the actually stored values.
     // FIXME: the nse value computed in this way might be wrong when there is
-    // any "compressed_hi" level.
+    // any "loose_compressed" level.
     rewriter.replaceOp(
         op, genValMemSize(rewriter, op.getLoc(), adaptor.getTensor()));
     return success();
@@ -1316,8 +1325,8 @@ struct SparseAssembleOpConverter : public OpConversionPattern<AssembleOp> {
       }
 
       if (isDLTWithPos(dlt)) {
-        assert(isCompressedDLT(dlt) || isCompressedWithHiDLT(dlt));
-        if (isCompressedWithHiDLT(dlt)) {
+        assert(isCompressedDLT(dlt) || isLooseCompressedDLT(dlt));
+        if (isLooseCompressedDLT(dlt)) {
           memSize = rewriter.create<arith::MulIOp>(loc, memSize, c2);
           posBack = rewriter.create<arith::SubIOp>(loc, memSize, c1);
         } else {
@@ -1531,9 +1540,9 @@ struct SparseNewOpConverter : public OpConversionPattern<NewOp> {
           rewriter.create<scf::IfOp>(loc, notSorted, /*else*/ false);
       rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
       auto xPerm = rewriter.getMultiDimIdentityMap(lvlRank);
-      rewriter.create<SortCooOp>(loc, nse, xs, ValueRange{ys}, xPerm,
-                                 rewriter.getIndexAttr(0),
-                                 SparseTensorSortKind::HybridQuickSort);
+      rewriter.create<SortOp>(loc, nse, xs, ValueRange{ys}, xPerm,
+                              rewriter.getIndexAttr(0),
+                              SparseTensorSortKind::HybridQuickSort);
       rewriter.setInsertionPointAfter(ifOp);
     }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index 37f6971cf4df1a..a3361c2cd48c6d 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -162,38 +162,6 @@ static SmallVector<Value> getDimShape(OpBuilder &builder, Location loc,
   return out;
 }
 
-/// Populates the given sizes array for concatenation from type (for static
-/// sizes) and from an already-converted opaque pointer source (for dynamic
-/// sizes).
-static void concatDimSizesFromInputs(OpBuilder &builder, Location loc,
-                                     SparseTensorType dstTp, ValueRange srcs,
-                                     Dimension dim,
-                                     SmallVectorImpl<Value> &dimSizes) {
-  assert(dim < dstTp.getDimRank() && "Dimension is out of bounds");
-  dimSizes.clear();
-
-  // We first fills the sizes from an input tensor, and then
-  // compute the size of the concatenation dimension if necessary.
-  const auto srcTp = getSparseTensorType(srcs[0]);
-  if (srcTp.hasEncoding())
-    // Reuses sizes from an arbitrary input tensor is fine.
-    fillDimSizes(builder, loc, srcTp, srcs[0], dimSizes);
-  else
-    sizesFromSrc(builder, dimSizes, loc, srcs[0]);
-
-  if (const auto sz = dstTp.getStaticDimSize(dim)) {
-    // Faithfully take the static size.
-    dimSizes[dim] = constantIndex(builder, loc, *sz);
-  } else {
-    // Else, dynamically compute the size.
-    for (const auto src : srcs.drop_front()) {
-      const auto srcTp = getSparseTensorType(src);
-      Value srcSz = createOrFoldDimCall(builder, loc, srcTp, src, dim);
-      dimSizes[dim] = builder.create<arith::AddIOp>(loc, dimSizes[dim], srcSz);
-    }
-  }
-}
-
 /// Generates an uninitialized buffer of the given size and type,
 /// but returns it as type `memref<? x $tp>` (rather than as type
 /// `memref<$sz x $tp>`). Unlike temporary buffers on the stack,
@@ -467,195 +435,6 @@ static bool canUseDirectConversion(ArrayRef<DimLevelType> dimTypes) {
   return true;
 }
 
-/// Helper method to translate coordinates during a reshaping operation.
-/// TODO: provide as general utility to MLIR at large?
-static void reshapeCoords(Location loc, OpBuilder &builder,
-                          ArrayRef<ReassociationIndices> reassociation,
-                          ValueRange srcSizes, Value srcCoords,
-                          ValueRange dstSizes, Value dstCoords) {
-  const auto srcCvs = loadAll(builder, loc, srcSizes.size(), srcCoords);
-  SmallVector<Value> dstCvs;
-  reshapeCvs(builder, loc, reassociation, srcSizes, srcCvs, dstSizes, dstCvs);
-  assert(dstCvs.size() == dstSizes.size());
-  storeAll(builder, loc, dstCoords, dstCvs);
-}
-
-/// Generate code for a general sparse to sparse reshaping operation.
-/// Note that unlike dense reshaping (which can be done with a "cheap"
-/// change of view), sparse reshaping is currently done with actual
-/// data shuffling.
-///
-/// TODO: proportional to nnz, but still a lot of data movement
-///       https://github.com/llvm/llvm-project/issues/56477
-///
-///   iter = src->toCOO();
-///   coo = newSparseCOO()
-///   while (elem = iter->getNext()) {
-///     coo->add(reshape(elem.coords), elem.value)
-///   }
-///   s = newSparseTensor(coo)
-template <typename ReshapeOp>
-static LogicalResult
-genSparse2SparseReshape(ReshapeOp op, typename ReshapeOp::Adaptor adaptor,
-                        ConversionPatternRewriter &rewriter) {
-  Location loc = op.getLoc();
-  const auto srcTp = getSparseTensorType(op.getSrc());
-  const auto dstTp = getSparseTensorType(op.getResult());
-  if (!srcTp.hasEncoding() || !dstTp.hasEncoding())
-    return failure();
-  Type elemTp = srcTp.getElementType();
-  assert(elemTp == dstTp.getElementType() &&
-         "reshape should not change element type");
-  // Start an iterator over the source tensor (in coordinate order).
-  SmallVector<Value> srcDimSizes =
-      getDimSizes(rewriter, loc, srcTp, adaptor.getSrc());
-  NewCallParams params(rewriter, loc);
-  Value iter = params.genBuffers(srcTp.withoutDimToLvl(), srcDimSizes)
-                   .genNewCall(Action::kToIterator, adaptor.getSrc());
-  // Start a new COO for the destination tensor.
-  SmallVector<Value> dstDimSizes;
-  if (dstTp.hasStaticDimShape())
-    // Static "shapes" are in fact "sizes".
-    fillDimShape(rewriter, loc, dstTp, dstDimSizes);
-  else
-    genReshapeDstShape(rewriter, loc, dstDimSizes, srcDimSizes,
-                       dstTp.getDimShape(), op.getReassociationIndices());
-  const Value coo =
-      params.genBuffers(dstTp, dstDimSizes).genNewCall(Action::kEmptyCOO);
-  const Value dstDimToLvl = params.getDimToLvl();
-  // Construct a while loop over the iterator.
-  const Type iTp = rewriter.getIndexType();
-  const Value srcDimCoords = genAlloca(rewriter, loc, srcTp.getDimRank(), iTp);
-  const Value dstDimCoords = genAlloca(rewriter, loc, dstTp.getDimRank(), iTp);
-  const Value elemPtr = genAllocaScalar(rewriter, loc, elemTp);
-  const SmallVector<Value> noArgs;
-  const SmallVector<Type> noTypes;
-  auto whileOp = rewriter.create<scf::WhileOp>(loc, noTypes, noArgs);
-  Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, noTypes);
-  rewriter.setInsertionPointToEnd(before);
-  Value cond = genGetNextCall(rewriter, loc, iter, srcDimCoords, elemPtr);
-  rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
-  // Translate coordinates from source to target and insert. Note that we do
-  // not need to store the value in elemPtr, as the value is still there.
-  Block *after = rewriter.createBlock(&whileOp.getAfter(), {}, noTypes);
-  rewriter.setInsertionPointToStart(after);
-  // We probably don't need these assertions, but better safe than sorry.
-  assert(srcTp.getDimRank() == srcDimSizes.size());
-  assert(dstTp.getDimRank() == dstDimSizes.size());
-  reshapeCoords(loc, rewriter, op.getReassociationIndices(), srcDimSizes,
-                srcDimCoords, dstDimSizes, dstDimCoords);
-  genAddEltCall(rewriter, loc, elemTp, coo, elemPtr, dstDimCoords, dstDimToLvl);
-  rewriter.create<scf::YieldOp>(loc);
-  // Final call to construct sparse tensor storage and free temporary resources.
-  rewriter.setInsertionPointAfter(whileOp);
-  Value dst = params.genNewCall(Action::kFromCOO, coo);
-  genDelCOOCall(rewriter, loc, elemTp, coo);
-  genDelIteratorCall(rewriter, loc, elemTp, iter);
-  rewriter.replaceOp(op, dst);
-  return success();
-}
-
-// Generates a while loop that iterates over the COO list extracted
-// from `t`, using `bodyBuilder` to build the loop body.
-//   while (elem = coo->getNext()) {
-//     bodyBuilder
-//   }
-// TODO: It can be used by other operators (ReshapeOp, ConvertOP) conversion to
-// reduce code repetition!
-// TODO: rename to `genSparseIterationLoop`?
-static void genSparseCOOIterationLoop(
-    ConversionPatternRewriter &rewriter, Location loc, Value t,
-    SparseTensorType stt,
-    function_ref<void(OpBuilder &, Location, Value, Value)> bodyBuilder) {
-  assert(stt.hasEncoding() &&
-         "Generating Sparse Tensor COO Loop on a Dense Tensor!");
-  const Dimension dimRank = stt.getDimRank();
-  const Type elemTp = stt.getElementType();
-
-  // Start an iterator over the tensor (in coordinate order).
-  const auto noPerm = stt.withoutDimToLvl();
-  SmallVector<Value> dimSizes = getDimSizes(rewriter, loc, noPerm, t);
-  Value iter = NewCallParams(rewriter, loc)
-                   .genBuffers(noPerm, dimSizes)
-                   .genNewCall(Action::kToIterator, t);
-
-  // Construct a while loop over the iterator.
-  const Type iTp = rewriter.getIndexType();
-  Value srcDimCoords = genAlloca(rewriter, loc, dimRank, iTp);
-  Value elemPtr = genAllocaScalar(rewriter, loc, elemTp);
-  const SmallVector<Value> noArgs;
-  const SmallVector<Type> noTypes;
-  auto whileOp = rewriter.create<scf::WhileOp>(loc, noTypes, noArgs);
-  Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, noTypes);
-  rewriter.setInsertionPointToEnd(before);
-  Value cond = genGetNextCall(rewriter, loc, iter, srcDimCoords, elemPtr);
-  rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
-  Block *after = rewriter.createBlock(&whileOp.getAfter(), {}, noTypes);
-  rewriter.setInsertionPointToStart(after);
-
-  const bool hasDenseDim =
-      llvm::any_of(stt.getEncoding().getLvlTypes(), isDenseDLT);
-  if (hasDenseDim) {
-    Value elemV = rewriter.create<memref::LoadOp>(loc, elemPtr);
-    Value isZero = genIsNonzero(rewriter, loc, elemV);
-    scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, isZero, /*else*/ false);
-    rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
-  }
-  // Callback here to build loop body.
-  bodyBuilder(rewriter, loc, srcDimCoords, elemPtr);
-
-  // Exit the scope from the IfOp.
-  if (hasDenseDim)
-    rewriter.setInsertionPointToEnd(after);
-
-  rewriter.create<scf::YieldOp>(loc);
-  // Finish generating loop.
-  rewriter.setInsertionPointAfter(whileOp);
-
-  // Free memory for iterator.
-  genDelIteratorCall(rewriter, loc, elemTp, iter);
-}
-
-// Generate loop that iterates over a dense tensor.
-//   for i1 in dim1
-//    ..
-//     for ik in dimk
-//       val = a[i1,..,ik]
-//       if val != 0
-//         bodyBuilder(v, [i1, ..., ik])
-// TODO: It can be used by other operators (ReshapeOp, ConvertOP) conversion to
-// reduce code repetition!
-static void genDenseTensorIterationLoop(
-    ConversionPatternRewriter &rewriter, Location loc, Value t,
-    SparseTensorType stt,
-    function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilder) {
-  assert(!stt.hasEncoding() &&
-         "Generating Dense Tensor Loop on a Sparse Tensor!");
-
-  const Dimension dimRank = stt.getDimRank();
-  Value zero = constantIndex(rewriter, loc, 0);
-  Value one = constantIndex(rewriter, loc, 1);
-
-  SmallVector<Value> lo;
-  SmallVector<Value> hi;
-  SmallVector<Value> st;
-
-  // Fill out loop iteration information.
-  for (Dimension d = 0; d < dimRank; d++) {
-    lo.push_back(zero);
-    hi.push_back(linalg::createOrFoldDimOp(rewriter, loc, t, d));
-    st.push_back(one);
-  }
-
-  scf::buildLoopNest(rewriter, loc, lo, hi, st, {},
-                     [&](OpBuilder &builder, Location loc, ValueRange ivs,
-                         ValueRange args) -> scf::ValueVector {
-                       // Invoke callback to build the body of the loop.
-                       bodyBuilder(builder, loc, ivs);
-                       return {};
-                     });
-}
-
 //===----------------------------------------------------------------------===//
 // Conversion rules.
 //===----------------------------------------------------------------------===//
@@ -713,19 +492,6 @@ class SparseCastConverter : public OpConversionPattern<tensor::CastOp> {
   }
 };
 
-/// Sparse conversion rule for a reshape operator.
-template <typename ReshapeOp>
-class SparseReshapeConverter : public OpConversionPattern<ReshapeOp> {
-public:
-  using OpAdaptor = typename OpConversionPattern<ReshapeOp>::OpAdaptor;
-  using OpConversionPattern<ReshapeOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(ReshapeOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    return genSparse2SparseReshape(op, adaptor, rewriter);
-  }
-};
-
 /// Sparse conversion rule for the new operator.
 class SparseTensorNewConverter : public OpConversionPattern<NewOp> {
 public:
@@ -1299,168 +1065,6 @@ class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
   }
 };
 
-/// Sparse conversion rule for the concatenate operator.
-class SparseTensorConcatConverter : public OpConversionPattern<ConcatenateOp> {
-public:
-  using OpConversionPattern::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(ConcatenateOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    // The conversion works as follow:
-    // (1). When output is sparse and not all dims are dense, and mix of inputs:
-    //    a_sparse = concat (b_dense, c_sparse, ....)
-    // =>
-    //    coo_for_a = newSparseCOO(shapeOf(a))
-    //    for i, j, k // dense input
-    //      coo->add(adjustForOffset(i,j,k), b[i,j,k])
-    //
-    //    for elem in sparse_input
-    //      coo->add(adjustForOffset(elem.coords), elem.value)
-    //    ...
-    //    a = newSparseTensor(coo_for_a)
-    //    return a
-    //
-    // (2). When output is dense or annotated all dense, and mix of inputs:
-    //    a_dense = concat (b_dense, c_sparse, ....)
-    // =>
-    //    a = malloc(shapeOf(a)) or newSparseAllDense(shapeOf(a))
-    //    for i, j, k // dense input
-    //      a[ adjustForOffset(i,j,k) ] = b[i,j,k]
-    //
-    //    for elem in sparse_input
-    //      a[ adjustForOffset(elem.coords) ] = elem.value
-    //    return a
-    Location loc = op.getLoc();
-    const auto dstTp = getSparseTensorType(op);
-    const auto dstEnc = dstTp.getEncoding();
-    const Type elemTp = dstTp.getElementType();
-    const Dimension concatDim = op.getDimension();
-    const Dimension dimRank = dstTp.getDimRank();
-
-    Value dst;         // destination tensor
-    Value dstDimToLvl; // destination tensor permutation (if sparse out)
-    // A pointer to the value being inserted (if dense => sparse)
-    Value elemPtr;
-    // Memory that holds the dim-coords for destination tensor (if sparse out)
-    Value dstDimCoords;
-    // The offset applied to the dimension to be concated (starting from 0)
-    Value offset = constantIndex(rewriter, loc, 0);
-
-    SmallVector<Value> dimSizes;
-    concatDimSizesFromInputs(rewriter, loc, dstTp, op.getInputs(), concatDim,
-                             dimSizes);
-
-    NewCallParams params(rewriter, loc);
-    const bool allDense = dstTp.hasEncoding() && dstTp.isAllDense();
-    Value dstTensor;
-    if (dstTp.hasEncoding()) {
-      // Start a new COO or an initialized annotated all dense sparse tensor.
-      dst = params.genBuffers(dstTp, dimSizes)
-                .genNewCall(allDense ? Action::kEmpty : Action::kEmptyCOO);
-      dstDimCoords = genAlloca(rewriter, loc, dimRank, rewriter.getIndexType());
-      if (allDense) {
-        dstTensor = dst;
-        // Get the values buffer for the sparse tensor and reshape it to the
-        // corresponding dense tensor shape.
-        dst = genValuesCall(rewriter, loc,
-                            MemRefType::get({ShapedType::kDynamic}, elemTp),
-                            {dst});
-        // Pass the `dstDimCoords` buffer for `reshapeValuesToLevels`
-        // to reuse for storing level-sizes (yes, "level-sizes").
-        // This is safe to do because `dstTp` is a dense-tensor type,
-        // and therefore lvlRank == dimRank.
-        dst = reshapeValuesToLevels(rewriter, loc, dstEnc, dimSizes, dst,
-                                    dstDimCoords);
-      } else {
-        dstDimToLvl = params.getDimToLvl();
-        elemPtr = genAllocaScalar(rewriter, loc, elemTp);
-      }
-    } else {
-      // TODO: Dense buffers should be allocated/deallocated via the callback
-      // in BufferizationOptions.
-      dst = allocDenseTensor(rewriter, loc, dstTp, dimSizes);
-    }
-    const Level lvlRank = dstTp.getLvlRank();
-    const auto dcvs2lcvs = [&](ValueRange dcvs) -> SmallVector<Value> {
-      SmallVector<Value> lcvs;
-      lcvs.reserve(lvlRank);
-      for (Level l = 0; l < lvlRank; l++)
-        // FIXME: `toOrigDim` is deprecated
-        lcvs.push_back(dcvs[toOrigDim(dstEnc, l)]);
-      return lcvs;
-    };
-    for (const auto &it : llvm::zip(op.getInputs(), adaptor.getInputs())) {
-      Value orignalOp = std::get<0>(it); // Input (with encoding) from Op
-      Value adaptedOp = std::get<1>(it); // Input (type converted) from adaptor
-      const auto srcTp = getSparseTensorType(orignalOp);
-      if (srcTp.hasEncoding()) {
-        genSparseCOOIterationLoop(
-            rewriter, loc, adaptedOp, srcTp,
-            [&](OpBuilder &builder, Location loc, Value dimCoords,
-                Value elemPtr) -> void {
-              const auto dcvs =
-                  loadAll(builder, loc, dimRank, dimCoords, concatDim, offset);
-              if (dstTp.hasEncoding() && !allDense) {
-                // Case: sparse => sparse, except for annotated all dense.
-                storeAll(builder, loc, dstDimCoords, dcvs);
-                genAddEltCall(builder, loc, elemTp, dst, elemPtr, dstDimCoords,
-                              dstDimToLvl);
-              } else {
-                // Case: sparse => dense, or annotated all dense.
-                const auto lcvs = allDense ? dcvs2lcvs(dcvs) : dcvs;
-                insertScalarIntoDenseTensor(builder, loc, elemPtr, dst, lcvs);
-              }
-            });
-      } else {
-        genDenseTensorIterationLoop(
-            rewriter, loc, adaptedOp, srcTp,
-            [&](OpBuilder &builder, Location loc, ValueRange dcvs) -> void {
-              if (dstTp.hasEncoding() && !allDense) {
-                // Case: dense => sparse, except for annotated all dense.
-                assert(dcvs.size() == static_cast<size_t>(dimRank));
-                storeAll(builder, loc, dstDimCoords, dcvs, concatDim, offset);
-                Value val = genValueForDense(builder, loc, adaptedOp, dcvs);
-                builder.create<memref::StoreOp>(loc, val, elemPtr);
-                genAddEltCall(builder, loc, elemTp, dst, elemPtr, dstDimCoords,
-                              dstDimToLvl);
-              } else {
-                // Case: dense => dense, or annotated all dense.
-                Value val = genValueForDense(builder, loc, adaptedOp, dcvs);
-                // Despite the name, this isn't actually level-cvs until
-                // after the `dcvs2lcvs` call.
-                SmallVector<Value> lcvs(dcvs);
-                // Apply offset.
-                lcvs[concatDim] =
-                    builder.create<arith::AddIOp>(loc, lcvs[concatDim], offset);
-                if (allDense)
-                  lcvs = dcvs2lcvs(lcvs);
-                builder.create<memref::StoreOp>(loc, val, dst, lcvs);
-              }
-            });
-      }
-      // Accumulate offset.
-      // TODO: avoid calling sparseDimSize multiple times by caching the result!
-      Value curDim =
-          createOrFoldDimCall(rewriter, loc, srcTp, adaptedOp, concatDim);
-      offset = rewriter.create<arith::AddIOp>(loc, offset, curDim);
-    }
-    if (!dstTp.hasEncoding()) {
-      rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(
-          op, dstTp.getRankedTensorType(), dst);
-    } else if (allDense) {
-      rewriter.replaceOp(op, dstTensor);
-    } else {
-      // In sparse output case, the destination holds the COO.
-      Value coo = dst;
-      dst = params.genNewCall(Action::kFromCOO, coo);
-      // Release resources.
-      genDelCOOCall(rewriter, loc, elemTp, coo);
-      rewriter.replaceOp(op, dst);
-    }
-    return success();
-  }
-};
-
 /// Sparse conversion rule for the output operator.
 class SparseTensorOutConverter : public OpConversionPattern<OutOp> {
 public:
@@ -1535,19 +1139,16 @@ mlir::SparseTensorTypeToPtrConverter::SparseTensorTypeToPtrConverter() {
 void mlir::populateSparseTensorConversionPatterns(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     const SparseTensorConversionOptions &options) {
-  patterns
-      .add<SparseReturnConverter, SparseTensorToDimSizeConverter,
-           SparseCastConverter, SparseTensorNewConverter,
-           SparseReshapeConverter<tensor::ExpandShapeOp>,
-           SparseReshapeConverter<tensor::CollapseShapeOp>,
-           SparseTensorConcatConverter, SparseTensorAllocConverter,
-           SparseTensorEmptyConverter, SparseTensorDeallocConverter,
-           SparseTensorToPositionsConverter, SparseTensorToCoordinatesConverter,
-           SparseTensorToValuesConverter, SparseNumberOfEntriesConverter,
-           SparseTensorLoadConverter, SparseTensorInsertConverter,
-           SparseTensorExpandConverter, SparseTensorCompressConverter,
-           SparseTensorOutConverter, SparseTensorAssembleConverter>(
-          typeConverter, patterns.getContext());
+  patterns.add<SparseReturnConverter, SparseTensorToDimSizeConverter,
+               SparseCastConverter, SparseTensorNewConverter,
+               SparseTensorAllocConverter, SparseTensorEmptyConverter,
+               SparseTensorDeallocConverter, SparseTensorToPositionsConverter,
+               SparseTensorToCoordinatesConverter,
+               SparseTensorToValuesConverter, SparseNumberOfEntriesConverter,
+               SparseTensorLoadConverter, SparseTensorInsertConverter,
+               SparseTensorExpandConverter, SparseTensorCompressConverter,
+               SparseTensorOutConverter, SparseTensorAssembleConverter>(
+      typeConverter, patterns.getContext());
   patterns.add<SparseTensorConvertConverter>(typeConverter,
                                              patterns.getContext(), options);
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index 7d2f0c7f139cda..f50d3d4606554a 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -207,7 +207,7 @@ struct SparseTensorCodegenPass
     ConversionTarget target(*ctx);
     // Most ops in the sparse dialect must go!
     target.addIllegalDialect<SparseTensorDialect>();
-    target.addLegalOp<SortCooOp>();
+    target.addLegalOp<SortOp>();
     target.addLegalOp<PushBackOp>();
     // Storage specifier outlives sparse tensor pipeline.
     target.addLegalOp<GetStorageSpecifierOp>();
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 277903dc55b743..b0bd22b156cc29 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -1220,9 +1220,9 @@ struct ConvertRewriter : public OpRewritePattern<ConvertOp> {
       assert(xPerm.isPermutation()); // must be a permutation.
 
       Value xs = genToCoordinatesBuffer(rewriter, loc, src);
-      rewriter.create<SortCooOp>(loc, nnz, xs, ValueRange{y}, xPerm,
-                                 rewriter.getIndexAttr(0),
-                                 SparseTensorSortKind::HybridQuickSort);
+      rewriter.create<SortOp>(loc, nnz, xs, ValueRange{y}, xPerm,
+                              rewriter.getIndexAttr(0),
+                              SparseTensorSortKind::HybridQuickSort);
     }
 
     // For each element in the COO tensor, insert the element to the dst tensor.
@@ -1474,17 +1474,17 @@ void mlir::populatePostSparsificationRewriting(RewritePatternSet &patterns,
                                                bool enableRT,
                                                bool enableForeach,
                                                bool enableConvert) {
-  patterns.add<ReshapeRewriter<tensor::ExpandShapeOp>,
-               ReshapeRewriter<tensor::CollapseShapeOp>, TensorReshapeRewriter>(
-      patterns.getContext());
+  patterns.add<ConcatenateRewriter, ReshapeRewriter<tensor::ExpandShapeOp>,
+               ReshapeRewriter<tensor::CollapseShapeOp>,
+               Sparse2SparseReshapeRewriter<tensor::ExpandShapeOp>,
+               Sparse2SparseReshapeRewriter<tensor::CollapseShapeOp>,
+               TensorReshapeRewriter>(patterns.getContext());
   if (enableForeach)
     patterns.add<ForeachRewriter>(patterns.getContext());
+
   // TODO: If RT not enabled, rewrite concatenate ops, etc here.
   if (!enableRT) {
-    patterns.add<ConcatenateRewriter, NewRewriter, OutRewriter,
-                 Sparse2SparseReshapeRewriter<tensor::ExpandShapeOp>,
-                 Sparse2SparseReshapeRewriter<tensor::CollapseShapeOp>>(
-        patterns.getContext());
+    patterns.add<NewRewriter, OutRewriter>(patterns.getContext());
     if (enableConvert)
       patterns.add<ConvertRewriter>(patterns.getContext());
   }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index fee32a5717f62a..e47131bb78318a 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -250,7 +250,7 @@ static bool findAffine(Merger &merger, TensorId tid, Level lvl, AffineExpr a,
     }
 
     if (auto binOp = a.dyn_cast<AffineBinaryOpExpr>()) {
-      // We do not set dim level format for affine expresssion like d0 + d1 on
+      // We do not set dim level format for affine expression like d0 + d1 on
       // either loop index at d0 or d1.
       // We continue the recursion merely to check whether current affine is
       // admissible or not.
@@ -309,7 +309,7 @@ static bool findDepIdxSet(Merger &merger, TensorId tensor, Level lvl,
       if (merger.hasDependentLvl(ldx, tensor)) {
         // TODO: This can be supported by coiterate slices if the loop idx is
         // appeared on affine index for different tensor, or take slice on
-        // mulitple dimensions when it is on the same tensor.
+        // multiple dimensions when it is on the same tensor.
         // E.g.,
         // `d0 + d1` for indexing t0[lvl0] and `d0 + d2` for indexing t1[lvl0]
         // d0_1 = getNextSliceOffset t0 along lvl0
@@ -357,7 +357,7 @@ static bool findDepIdxSet(Merger &merger, TensorId tensor, Level lvl,
 /// indexing-expression is `d0 + d1`)
 static unsigned getNumNonTrivialIdxExpOnSparseLvls(AffineMap map,
                                                    Value tensor) {
-  // The `tensor` is not guaranted to have `RankedTensorType`, therefore
+  // The `tensor` is not guaranteed to have `RankedTensorType`, therefore
   // we can't use `getRankedTensorType`/`getSparseTensorType` here.
   // However, we don't need to handle `StorageSpecifierType`, so we
   // can use `SparseTensorType` once we guard against non-tensors.
@@ -636,7 +636,7 @@ static void addFilterLoopBasedConstraints(CodegenEnv &env, OpOperand &t,
 
   // Each tensor expression and optional dimension ordering (row-major
   // by default) puts an ordering constraint on the loop indices. For
-  // example, the tensor expresion A_ijk forces the ordering i < j < k
+  // example, the tensor expression A_ijk forces the ordering i < j < k
   // on the loop indices if no explicit dimension ordering is given.
   const Level lvlRank = map.getNumResults();
   assert(!enc || lvlRank == enc.getLvlRank());
@@ -668,7 +668,7 @@ static void addFilterLoopBasedConstraints(CodegenEnv &env, OpOperand &t,
 
       // Applying order constraints on every pair of dimExpr between two
       // compound affine expressions can sometime too strict:
-      // E.g, for [dense, dense] -> (d0 + d1, d2 + d3).
+      // E.g., for [dense, dense] -> (d0 + d1, d2 + d3).
       // It is totally fine to have loop sequence d0->d2->d1->d3 instead of
       // requiring d0 < d2, d1 < d2, d0 < d3, d1 < d3.
       // We also relax the affine constraint when use slice-based algorithm
@@ -815,7 +815,7 @@ static bool computeIterationGraph(CodegenEnv &env, SortMask mask,
       const TensorId tid = env.makeTensorId(t.getOperandNumber());
       for (LoopId i = 0; i < numLoops; i++) {
         const auto dltI = env.dlt(tid, i);
-        if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
+        if (isCompressedDLT(dltI) || isLooseCompressedDLT(dltI) ||
             isSingletonDLT(dltI)) {
           for (LoopId j = 0; j < numLoops; j++)
             if (isUndefDLT(env.dlt(tid, j))) {
@@ -1316,7 +1316,7 @@ static void genExpand(CodegenEnv &env, OpBuilder &builder, LoopOrd at,
     return; // not needed at this level
   assert(!env.isReduc());
   // Generate start or end of an expanded access pattern. Note that because
-  // an expension does not rely on the ongoing contents of the sparse storage
+  // an expansion does not rely on the ongoing contents of the sparse storage
   // scheme, we can use the original tensor as incoming SSA value (which
   // simplifies codegen a bit). If expansion on the actual contents is ever
   // needed, we will need to use the SSA value in the insertion chain instead.
@@ -1508,7 +1508,7 @@ static scf::IfOp genIf(CodegenEnv &env, OpBuilder &builder, LoopId ldx,
         assert(ldx == env.merger().loop(b));
         Value clause;
         if (isCompressedDLT(dlt) || isSingletonDLT(dlt) ||
-            isCompressedWithHiDLT(dlt)) {
+            isLooseCompressedDLT(dlt)) {
           assert(lvl.has_value());
           const Value crd = env.emitter().getCoords()[tid][*lvl];
           const Value lvar = env.getLoopVar(ldx);
@@ -1593,7 +1593,7 @@ static bool startLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp,
       needsUniv = true;
     }
     if (isCompressedDLT(dlt) || isSingletonDLT(dlt) ||
-        isCompressedWithHiDLT(dlt) || isIdxReduc) {
+        isLooseCompressedDLT(dlt) || isIdxReduc) {
       // Only when this is a index reduction loop, can the dlt be undefined.
       assert(!isUndefDLT(dlt) || isIdxReduc);
       // sparse/singleton levels, or a dense/sparse index reduction loop.
@@ -2007,9 +2007,9 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
     bool isAdmissible = false;
     bool hasCycle = true;
 
-    // An const list of all masks that we used for interation graph
+    // A const list of all masks that we used for iteration graph
     // computation. Must be ordered from more strict to less strict.
-    // Ideally (though might not be guaranteed), the eariler a constraint mask
+    // Ideally (though might not be guaranteed), the earlier a constraint mask
     // can be satisfied, the faster the generated kernel will be.
     const auto allMasks = {
         SortMask::kIncludeAll,        SortMask::kIncludeDense,
@@ -2038,7 +2038,7 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
     env.startEmit();
     genBuffers(env, rewriter);
     // TODO: Constant affine expression should be handled differently when using
-    // slice-based codegen, it does not matter now becasue we already reject the
+    // slice-based codegen, it does not matter now because we already reject the
     // constant expression at a earlier stage.
     genInitConstantDenseAddress(env, rewriter);
     genStmt(env, rewriter, env.getExprId(), 0);
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 4143efbd0ab28e..bfa2f67eab37a7 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -490,7 +490,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
     if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
       const auto dlt = getLvlType(b);
       if (!isCompressedDLT(dlt) && !isSingletonDLT(dlt) &&
-          !isCompressedWithHiDLT(dlt)) {
+          !isLooseCompressedDLT(dlt)) {
         if (reset)
           simple.reset(b);
         reset = true;
@@ -671,7 +671,7 @@ bool Merger::hasAnySparse(const BitVector &bits) const {
   for (TensorLoopId b : bits.set_bits()) {
     const auto dlt = getLvlType(b);
     if (isCompressedDLT(dlt) || isSingletonDLT(dlt) ||
-        isCompressedWithHiDLT(dlt))
+        isLooseCompressedDLT(dlt))
       return true;
   }
   return hasSparseIdxReduction(bits);
@@ -1101,7 +1101,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
     }
   case TensorExp::Kind::kCmpF:
   case TensorExp::Kind::kCmpI:
-    // An comparison operation needs to be performed
+    // A comparison operation needs to be performed
     // for the disjunction of sparse iteration spaces.
     //
     //   x < y |  !y   |   y   |
@@ -1118,7 +1118,7 @@ LatSetId Merger::buildLattices(ExprId e, LoopId i) {
   case TensorExp::Kind::kShlI:
     // A shift operation by an invariant amount (viz. tensor expressions
     // can only occur at the left-hand-side of the operator) can be handled
-    // with the conjuction rule.
+    // with the conjunction rule.
     {
       const ExprId e0 = expr.children.e0;
       const ExprId e1 = expr.children.e1;
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index b08283f0070784..9386d0fd0f04fa 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -119,7 +119,11 @@ struct CollapseShapeOpInterface
                                                     tensor::CollapseShapeOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
-    return false;
+    // tensor.collapse_shape may reallocate, at which point the source buffer is
+    // copied. I.e., there will be a memory read side effect on the bufferized
+    // source. This function conservatively returns "true" because whether a
+    // copy will be created or not is not known at this point.
+    return true;
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
@@ -253,13 +257,12 @@ struct DimOpInterface
   }
 };
 
-/// Bufferization of tensor.empty. This op does not bufferize, but we need an
-/// interface implementation, so that the result of this op is considered
-/// "writable" (default impl. of `isWritable`). Results of ops that do not
-/// implement `BufferizableOpInterface` are not writable.
+/// Bufferization of "tensor.empty". Replace with "bufferization.alloc_tensor".
 struct EmptyOpInterface
     : public BufferizableOpInterface::ExternalModel<EmptyOpInterface,
                                                     tensor::EmptyOp> {
+  bool bufferizesToAllocation(Operation *op, Value value) const { return true; }
+
   bool resultBufferizesToMemoryWrite(Operation *op, OpResult opResult,
                                      const AnalysisState &state) const {
     // The returned tensor does not have specified contents.
@@ -268,17 +271,21 @@ struct EmptyOpInterface
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           const BufferizationOptions &options) const {
+    auto emptyOp = cast<tensor::EmptyOp>(op);
+
+    // Optimization: Fold away the op if it has no uses.
     if (op->getUses().empty()) {
       rewriter.eraseOp(op);
       return success();
     }
 
-    // tensor.empty ops are used to indicate the shape of a tensor. They have
-    // no defined contents and cannot be bufferized. However, they can be
-    // converted to bufferization.alloc_tensor ops, which then bufferize to an
-    // allocation (--empty-tensor-to-alloc-tensor).
-    return op->emitOpError("cannot be bufferized, but can be converted to "
-                           "bufferization.alloc_tensor");
+    // Allocate a tensor. This emits a "bufferization.alloc_tensor" op.
+    FailureOr<Value> allocTensor = allocateTensorForShapedValue(
+        rewriter, op->getLoc(), emptyOp.getResult(), options, /*copy=*/false);
+    if (failed(allocTensor))
+      return failure();
+    rewriter.replaceOp(op, *allocTensor);
+    return success();
   }
 };
 
@@ -288,6 +295,8 @@ struct ExpandShapeOpInterface
                                                     tensor::ExpandShapeOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
+    // In contrast to tensor.collapse_shape, this op can always be bufferized
+    // without a copy.
     return false;
   }
 
@@ -635,11 +644,11 @@ struct InsertSliceOpInterface
     RankedTensorType destType = insertSliceOp.getDestType();
 
     // The source is always read.
-    if (&opOperand == &insertSliceOp.getSourceMutable()[0])
+    if (&opOperand == &insertSliceOp.getSourceMutable())
       return true;
 
     // For the destination, it depends...
-    assert(&opOperand == &insertSliceOp.getDestMutable()[0] && "expected dest");
+    assert(&opOperand == &insertSliceOp.getDestMutable() && "expected dest");
 
     // Dest is not read if it is entirely overwritten. E.g.:
     // tensor.insert_slice %a into %t[0][10][1] : ... into tensor<10xf32>
@@ -838,8 +847,9 @@ struct ReshapeOpInterface
                                                     tensor::ReshapeOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
+    // Depending on the layout map, the source buffer may have to be copied.
     auto reshapeOp = cast<tensor::ReshapeOp>(op);
-    return &opOperand == &reshapeOp.getShapeMutable()[0];
+    return &opOperand == &reshapeOp.getShapeMutable();
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
@@ -867,15 +877,20 @@ struct ReshapeOpInterface
       return failure();
 
     // memref.reshape requires the source buffer to have an identity layout.
-    // If the source memref does not have an identity layout, clone the source
+    // If the source memref does not have an identity layout, copy the source
     // into a new buffer with an identity layout.
     auto srcType = llvm::dyn_cast<MemRefType>(srcBuffer->getType());
     if (srcType && !srcType.getLayout().isIdentity()) {
-      auto identityType =
-          MemRefType::get(srcType.getShape(), srcType.getElementType());
+      FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
+          rewriter, op->getLoc(), reshapeOp.getSource(), options);
+      if (failed(tensorAlloc))
+        return failure();
+      auto memrefType = MemRefType::get(
+          srcType.getShape(), srcType.getElementType(), AffineMap(),
+          cast<BaseMemRefType>(srcBuffer->getType()).getMemorySpace());
       srcBuffer = rewriter
-                      .create<bufferization::CloneOp>(op->getLoc(),
-                                                      identityType, *srcBuffer)
+                      .create<bufferization::ToMemrefOp>(
+                          op->getLoc(), memrefType, *tensorAlloc)
                       .getResult();
     }
 
@@ -916,7 +931,7 @@ struct ParallelInsertSliceOpInterface
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                                const AnalysisState &state) const {
     auto parallelInsertSliceOp = cast<ParallelInsertSliceOp>(op);
-    return &opOperand == &parallelInsertSliceOp.getDestMutable()[0];
+    return &opOperand == &parallelInsertSliceOp.getDestMutable();
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.cpp
index dff9f64169d495..f4f46d54d78e59 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.cpp
@@ -63,7 +63,7 @@ struct InsertSliceOpInterface
     : public SubsetInsertionOpInterface::ExternalModel<InsertSliceOpInterface,
                                                        tensor::InsertSliceOp> {
   OpOperand &getSourceOperand(Operation *op) const {
-    return op->getOpOperand(0);
+    return cast<tensor::InsertSliceOp>(op).getSourceMutable();
   }
 
   bool
@@ -91,11 +91,11 @@ struct ParallelInsertSliceOpInterface
     : public SubsetInsertionOpInterface::ExternalModel<
           ParallelInsertSliceOpInterface, tensor::ParallelInsertSliceOp> {
   OpOperand &getSourceOperand(Operation *op) const {
-    return op->getOpOperand(0);
+    return cast<tensor::ParallelInsertSliceOp>(op).getSourceMutable();
   }
 
   OpOperand &getDestinationOperand(Operation *op) const {
-    return op->getOpOperand(1);
+    return cast<tensor::ParallelInsertSliceOp>(op).getDestMutable();
   }
 
   bool
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 44626260e2f9ef..0e20b379cc2a3e 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -1161,7 +1161,6 @@ void transform::ForeachOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   BlockArgument iterVar = getIterationVariable();
   if (any_of(getBody().front().without_terminator(), [&](Operation &op) {
-
         return isHandleConsumed(iterVar, cast<TransformOpInterface>(&op));
       })) {
     consumesHandle(getTarget(), effects);
@@ -1244,6 +1243,10 @@ transform::GetParentOp::apply(transform::TransformRewriter &rewriter,
       parent = parent->getParentOp();
     }
     if (!parent) {
+      if (getAllowEmptyResults()) {
+        results.set(llvm::cast<OpResult>(getResult()), parents);
+        return DiagnosedSilenceableFailure::success();
+      }
       DiagnosedSilenceableFailure diag =
           emitSilenceableError()
           << "could not find a parent op that matches all requirements";
@@ -1545,6 +1548,21 @@ transform::IncludeOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
       .checkAndReport();
 }
 
+//===----------------------------------------------------------------------===//
+// MatchOperationEmptyOp
+//===----------------------------------------------------------------------===//
+
+DiagnosedSilenceableFailure transform::MatchOperationEmptyOp::matchOperation(
+    ::std::optional<::mlir::Operation *> maybeCurrent,
+    transform::TransformResults &results, transform::TransformState &state) {
+  if (!maybeCurrent.has_value()) {
+    DBGS_MATCHER() << "MatchOperationEmptyOp success\n";
+    return DiagnosedSilenceableFailure::success();
+  }
+  DBGS_MATCHER() << "MatchOperationEmptyOp failure\n";
+  return emitSilenceableError() << "operation is not empty";
+}
+
 //===----------------------------------------------------------------------===//
 // MatchOperationNameOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
index b74e14d377c036..68a735e7ef8e0b 100644
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
@@ -161,17 +161,9 @@ static llvm::raw_ostream &
 printReproCall(llvm::raw_ostream &os, StringRef rootOpName, StringRef passName,
                const Pass::Option<std::string> &debugPayloadRootTag,
                const Pass::Option<std::string> &debugTransformRootTag,
-               const Pass::Option<std::string> &transformLibraryFileName,
                StringRef binaryName) {
-  std::string transformLibraryOption = "";
-  if (!transformLibraryFileName.empty()) {
-    transformLibraryOption =
-        llvm::formatv(" {0}={1}", transformLibraryFileName.getArgStr(),
-                      transformLibraryFileName.getValue())
-            .str();
-  }
   os << llvm::formatv(
-      "{7} --pass-pipeline=\"{0}({1}{{{2}={3} {4}={5}{6}})\"", rootOpName,
+      "{6} --pass-pipeline=\"{0}({1}{{{2}={3} {4}={5}})\"", rootOpName,
       passName, debugPayloadRootTag.getArgStr(),
       debugPayloadRootTag.empty()
           ? StringRef(kTransformDialectTagPayloadRootValue)
@@ -180,14 +172,15 @@ printReproCall(llvm::raw_ostream &os, StringRef rootOpName, StringRef passName,
       debugTransformRootTag.empty()
           ? StringRef(kTransformDialectTagTransformContainerValue)
           : debugTransformRootTag,
-      transformLibraryOption, binaryName);
+      binaryName);
   return os;
 }
 
 /// Prints the module rooted at `root` to `os` and appends
 /// `transformContainer` if it is not nested in `root`.
-llvm::raw_ostream &printModuleForRepro(llvm::raw_ostream &os, Operation *root,
-                                       Operation *transform) {
+static llvm::raw_ostream &printModuleForRepro(llvm::raw_ostream &os,
+                                              Operation *root,
+                                              Operation *transform) {
   root->print(os);
   if (!root->isAncestor(transform))
     transform->print(os);
@@ -196,12 +189,13 @@ llvm::raw_ostream &printModuleForRepro(llvm::raw_ostream &os, Operation *root,
 
 /// Saves the payload and the transform IR into a temporary file and reports
 /// the file name to `os`.
-void saveReproToTempFile(
-    llvm::raw_ostream &os, Operation *target, Operation *transform,
-    StringRef passName, const Pass::Option<std::string> &debugPayloadRootTag,
-    const Pass::Option<std::string> &debugTransformRootTag,
-    const Pass::Option<std::string> &transformLibraryFileName,
-    StringRef binaryName) {
+static void
+saveReproToTempFile(llvm::raw_ostream &os, Operation *target,
+                    Operation *transform, StringRef passName,
+                    const Pass::Option<std::string> &debugPayloadRootTag,
+                    const Pass::Option<std::string> &debugTransformRootTag,
+                    const Pass::Option<std::string> &transformLibraryFileName,
+                    StringRef binaryName) {
   using llvm::sys::fs::TempFile;
   Operation *root = getRootOperation(target);
 
@@ -226,8 +220,7 @@ void saveReproToTempFile(
 
   os << "=== Transform Interpreter Repro ===\n";
   printReproCall(os, root->getName().getStringRef(), passName,
-                 debugPayloadRootTag, debugTransformRootTag,
-                 transformLibraryFileName, binaryName)
+                 debugPayloadRootTag, debugTransformRootTag, binaryName)
       << " " << filename << "\n";
   os << "===================================\n";
 }
@@ -281,8 +274,7 @@ static void performOptionalDebugActions(
     llvm::dbgs() << "=== Transform Interpreter Repro ===\n";
     printReproCall(llvm::dbgs() << "cat <<EOF | ",
                    root->getName().getStringRef(), passName,
-                   debugPayloadRootTag, debugTransformRootTag,
-                   transformLibraryFileName, binaryName)
+                   debugPayloadRootTag, debugTransformRootTag, binaryName)
         << "\n";
     printModuleForRepro(llvm::dbgs(), root, transform);
     llvm::dbgs() << "\nEOF\n";
@@ -302,84 +294,243 @@ static void performOptionalDebugActions(
     transform->removeAttr(kTransformDialectTagAttrName);
 }
 
-/// Replaces external symbols in `block` with their (non-external) definitions
-/// from the given module.
-static LogicalResult defineDeclaredSymbols(Block &block, ModuleOp definitions) {
-  MLIRContext &ctx = *definitions->getContext();
-  auto consumedName =
-      StringAttr::get(&ctx, transform::TransformDialect::kArgConsumedAttrName);
-  auto readOnlyName =
-      StringAttr::get(&ctx, transform::TransformDialect::kArgReadOnlyAttrName);
-
-  for (Operation &op : llvm::make_early_inc_range(block)) {
-    LLVM_DEBUG(DBGS() << op << "\n");
-    auto symbol = dyn_cast<SymbolOpInterface>(op);
-    if (!symbol)
-      continue;
-    if (symbol->getNumRegions() == 1 && !symbol->getRegion(0).empty())
-      continue;
+/// Return whether `func1` can be merged into `func2`. For that to work `func1`
+/// has to be a declaration (aka has to be external) and `func2` either has to
+/// be a declaration as well, or it has to be public (otherwise, it wouldn't
+/// be visible by `func1`).
+static bool canMergeInto(FunctionOpInterface func1, FunctionOpInterface func2) {
+  return func1.isExternal() && (func2.isPublic() || func2.isExternal());
+}
 
-    LLVM_DEBUG(DBGS() << "looking for definition of symbol "
-                      << symbol.getNameAttr() << ":");
-    SymbolTable symbolTable(definitions);
-    Operation *externalSymbol = symbolTable.lookup(symbol.getNameAttr());
-    if (!externalSymbol || externalSymbol->getNumRegions() != 1 ||
-        externalSymbol->getRegion(0).empty()) {
-      LLVM_DEBUG(llvm::dbgs() << "not found\n");
-      continue;
-    }
+/// Merge `func1` into `func2`. The two ops must be inside the same parent op
+/// and mergable according to `canMergeInto`. The function erases `func1` such
+/// that only `func2` exists when the function returns.
+static LogicalResult mergeInto(FunctionOpInterface func1,
+                               FunctionOpInterface func2) {
+  assert(canMergeInto(func1, func2));
+  assert(func1->getParentOp() == func2->getParentOp() &&
+         "expected func1 and func2 to be in the same parent op");
+
+  // Check that function signatures match.
+  if (func1.getFunctionType() != func2.getFunctionType()) {
+    return func1.emitError()
+           << "external definition has a mismatching signature ("
+           << func2.getFunctionType() << ")";
+  }
 
-    auto symbolFunc = dyn_cast<FunctionOpInterface>(op);
-    auto externalSymbolFunc = dyn_cast<FunctionOpInterface>(externalSymbol);
-    if (!symbolFunc || !externalSymbolFunc) {
-      LLVM_DEBUG(llvm::dbgs() << "cannot compare types\n");
+  // Check and merge argument attributes.
+  MLIRContext *context = func1->getContext();
+  auto *td = context->getLoadedDialect<transform::TransformDialect>();
+  StringAttr consumedName = td->getConsumedAttrName();
+  StringAttr readOnlyName = td->getReadOnlyAttrName();
+  for (unsigned i = 0, e = func1.getNumArguments(); i < e; ++i) {
+    bool isExternalConsumed = func2.getArgAttr(i, consumedName) != nullptr;
+    bool isExternalReadonly = func2.getArgAttr(i, readOnlyName) != nullptr;
+    bool isConsumed = func1.getArgAttr(i, consumedName) != nullptr;
+    bool isReadonly = func1.getArgAttr(i, readOnlyName) != nullptr;
+    if (!isExternalConsumed && !isExternalReadonly) {
+      if (isConsumed)
+        func2.setArgAttr(i, consumedName, UnitAttr::get(context));
+      else if (isReadonly)
+        func2.setArgAttr(i, readOnlyName, UnitAttr::get(context));
       continue;
     }
 
-    LLVM_DEBUG(llvm::dbgs() << "found @" << externalSymbol << "\n");
-    if (symbolFunc.getFunctionType() != externalSymbolFunc.getFunctionType()) {
-      return symbolFunc.emitError()
-             << "external definition has a mismatching signature ("
-             << externalSymbolFunc.getFunctionType() << ")";
+    if ((isExternalConsumed && !isConsumed) ||
+        (isExternalReadonly && !isReadonly)) {
+      return func1.emitError()
+             << "external definition has mismatching consumption "
+                "annotations for argument #"
+             << i;
     }
+  }
+
+  // `func1` is the external one, so we can remove it.
+  assert(func1.isExternal());
+  func1->erase();
 
-    for (unsigned i = 0, e = symbolFunc.getNumArguments(); i < e; ++i) {
-      bool isExternalConsumed =
-          externalSymbolFunc.getArgAttr(i, consumedName) != nullptr;
-      bool isExternalReadonly =
-          externalSymbolFunc.getArgAttr(i, readOnlyName) != nullptr;
-      bool isConsumed = symbolFunc.getArgAttr(i, consumedName) != nullptr;
-      bool isReadonly = symbolFunc.getArgAttr(i, readOnlyName) != nullptr;
-      if (!isExternalConsumed && !isExternalReadonly) {
-        if (isConsumed)
-          externalSymbolFunc.setArgAttr(i, consumedName, UnitAttr::get(&ctx));
-        else if (isReadonly)
-          externalSymbolFunc.setArgAttr(i, readOnlyName, UnitAttr::get(&ctx));
+  return success();
+}
+
+/// Merge all symbols from `other` into `target`. Both ops need to implement the
+/// `SymbolTable` trait. Operations are moved from `other`, i.e., `other` may be
+/// modified by this function and might not verify after the function returns.
+/// Upon merging, private symbols may be renamed in order to avoid collisions in
+/// the result. Public symbols may not collide, with the exception of
+/// instances of `SymbolOpInterface`, where collisions are allowed if at least
+/// one of the two is external, in which case the other op preserved (or any one
+/// of the two if both are external).
+// TODO: Reconsider cloning individual ops rather than forcing users of the
+//       function to clone (or move) `other` in order to improve efficiency.
+//       This might primarily make sense if we can also prune the symbols that
+//       are merged to a subset (such as those that are actually used).
+static LogicalResult mergeSymbolsInto(Operation *target,
+                                      OwningOpRef<Operation *> other) {
+  assert(target->hasTrait<OpTrait::SymbolTable>() &&
+         "requires target to implement the 'SymbolTable' trait");
+  assert(other->hasTrait<OpTrait::SymbolTable>() &&
+         "requires target to implement the 'SymbolTable' trait");
+
+  SymbolTable targetSymbolTable(target);
+  SymbolTable otherSymbolTable(*other);
+
+  // Step 1:
+  //
+  // Rename private symbols in both ops in order to resolve conflicts that can
+  // be resolved that way.
+  LLVM_DEBUG(DBGS() << "renaming private symbols to resolve conflicts:\n");
+  // TODO: Do we *actually* need to test in both directions?
+  for (auto &&[symbolTable, otherSymbolTable] : llvm::zip(
+           SmallVector<SymbolTable *, 2>{&targetSymbolTable, &otherSymbolTable},
+           SmallVector<SymbolTable *, 2>{&otherSymbolTable,
+                                         &targetSymbolTable})) {
+    Operation *symbolTableOp = symbolTable->getOp();
+    for (Operation &op : symbolTableOp->getRegion(0).front()) {
+      auto symbolOp = dyn_cast<SymbolOpInterface>(op);
+      if (!symbolOp)
         continue;
+      StringAttr name = symbolOp.getNameAttr();
+      LLVM_DEBUG(DBGS() << "  found @" << name.getValue() << "\n");
+
+      // Check if there is a colliding op in the other module.
+      auto collidingOp =
+          cast_or_null<SymbolOpInterface>(otherSymbolTable->lookup(name));
+      if (!collidingOp)
+        continue;
+
+      LLVM_DEBUG(DBGS() << "    collision found for @" << name.getValue());
+
+      // Collisions are fine if both opt are functions and can be merged.
+      if (auto funcOp = dyn_cast<FunctionOpInterface>(op),
+          collidingFuncOp =
+              dyn_cast<FunctionOpInterface>(collidingOp.getOperation());
+          funcOp && collidingFuncOp) {
+        if (canMergeInto(funcOp, collidingFuncOp) ||
+            canMergeInto(collidingFuncOp, funcOp)) {
+          LLVM_DEBUG(llvm::dbgs() << " but both ops are functions and "
+                                     "will be merged\n");
+          continue;
+        }
+
+        // If they can't be merged, proceed like any other collision.
+        LLVM_DEBUG(llvm::dbgs() << " and both ops are function definitions");
       }
 
-      if ((isExternalConsumed && !isConsumed) ||
-          (isExternalReadonly && !isReadonly)) {
-        return symbolFunc.emitError()
-               << "external definition has mismatching consumption annotations "
-                  "for argument #"
-               << i;
+      // Collision can be resolved by renaming if one of the ops is private.
+      auto renameToUnique =
+          [&](SymbolOpInterface op, SymbolOpInterface otherOp,
+              SymbolTable &symbolTable,
+              SymbolTable &otherSymbolTable) -> LogicalResult {
+        LLVM_DEBUG(llvm::dbgs() << ", renaming\n");
+        FailureOr<StringAttr> maybeNewName =
+            symbolTable.renameToUnique(op, {&otherSymbolTable});
+        if (failed(maybeNewName)) {
+          InFlightDiagnostic diag = op->emitError("failed to rename symbol");
+          diag.attachNote(otherOp->getLoc())
+              << "attempted renaming due to collision with this op";
+          return diag;
+        }
+        LLVM_DEBUG(DBGS() << "      renamed to @" << maybeNewName->getValue()
+                          << "\n");
+        return success();
+      };
+
+      if (symbolOp.isPrivate()) {
+        if (failed(renameToUnique(symbolOp, collidingOp, *symbolTable,
+                                  *otherSymbolTable)))
+          return failure();
+        continue;
       }
+      if (collidingOp.isPrivate()) {
+        if (failed(renameToUnique(collidingOp, symbolOp, *otherSymbolTable,
+                                  *symbolTable)))
+          return failure();
+        continue;
+      }
+
+      LLVM_DEBUG(llvm::dbgs() << ", emitting error\n");
+      InFlightDiagnostic diag = symbolOp.emitError()
+                                << "doubly defined symbol @" << name.getValue();
+      diag.attachNote(collidingOp->getLoc()) << "previously defined here";
+      return diag;
     }
+  }
+
+  // TODO: This duplicates pass infrastructure. We should split this pass into
+  //       several and let the pass infrastructure do the verification.
+  for (auto *op : SmallVector<Operation *>{target, *other}) {
+    if (failed(mlir::verify(op)))
+      return op->emitError() << "failed to verify input op after renaming";
+  }
+
+  // Step 2:
+  //
+  // Move all ops from `other` into target and merge public symbols.
+  LLVM_DEBUG(DBGS() << "moving all symbols into target\n");
+  {
+    SmallVector<SymbolOpInterface> opsToMove;
+    for (Operation &op : other->getRegion(0).front()) {
+      if (auto symbol = dyn_cast<SymbolOpInterface>(op))
+        opsToMove.push_back(symbol);
+    }
+
+    for (SymbolOpInterface op : opsToMove) {
+      // Remember potentially colliding op in the target module.
+      auto collidingOp = cast_or_null<SymbolOpInterface>(
+          targetSymbolTable.lookup(op.getNameAttr()));
+
+      // Move op even if we get a collision.
+      LLVM_DEBUG(DBGS() << "  moving @" << op.getName());
+      op->moveBefore(&target->getRegion(0).front(),
+                     target->getRegion(0).front().end());
+
+      // If there is no collision, we are done.
+      if (!collidingOp) {
+        LLVM_DEBUG(llvm::dbgs() << " without collision\n");
+        continue;
+      }
+
+      // The two colliding ops must both be functions because we have already
+      // emitted errors otherwise earlier.
+      auto funcOp = cast<FunctionOpInterface>(op.getOperation());
+      auto collidingFuncOp =
+          cast<FunctionOpInterface>(collidingOp.getOperation());
+
+      // Both ops are in the target module now and can be treated symmetrically,
+      // so w.l.o.g. we can reduce to merging `funcOp` into `collidingFuncOp`.
+      if (!canMergeInto(funcOp, collidingFuncOp)) {
+        std::swap(funcOp, collidingFuncOp);
+      }
+      assert(canMergeInto(funcOp, collidingFuncOp));
+
+      LLVM_DEBUG(llvm::dbgs() << " with collision, trying to keep op at "
+                              << collidingFuncOp.getLoc() << ":\n"
+                              << collidingFuncOp << "\n");
 
-    OpBuilder builder(&op);
-    builder.setInsertionPoint(&op);
-    builder.clone(*externalSymbol);
-    symbol->erase();
+      // Update symbol table. This works with or without the previous `swap`.
+      targetSymbolTable.remove(funcOp);
+      targetSymbolTable.insert(collidingFuncOp);
+      assert(targetSymbolTable.lookup(funcOp.getName()) == collidingFuncOp);
+
+      // Do the actual merging.
+      if (failed(mergeInto(funcOp, collidingFuncOp))) {
+        return failure();
+      }
+    }
   }
 
+  if (failed(mlir::verify(target)))
+    return target->emitError()
+           << "failed to verify target op after merging symbols";
+
+  LLVM_DEBUG(DBGS() << "done merging ops\n");
   return success();
 }
 
 LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
     Operation *target, StringRef passName,
     const std::shared_ptr<OwningOpRef<ModuleOp>> &sharedTransformModule,
-    const std::shared_ptr<OwningOpRef<ModuleOp>> &libraryModule,
+    const std::shared_ptr<OwningOpRef<ModuleOp>> &transformLibraryModule,
     const RaggedArray<MappedValue> &extraMappings,
     const TransformOptions &options,
     const Pass::Option<std::string> &transformFileName,
@@ -387,6 +538,12 @@ LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
     const Pass::Option<std::string> &debugPayloadRootTag,
     const Pass::Option<std::string> &debugTransformRootTag,
     StringRef binaryName) {
+  bool hasSharedTransformModule =
+      sharedTransformModule && *sharedTransformModule;
+  bool hasTransformLibraryModule =
+      transformLibraryModule && *transformLibraryModule;
+  assert((!hasSharedTransformModule || !hasTransformLibraryModule) &&
+         "at most one of shared or library transform module can be set");
 
   // Step 1
   // ------
@@ -407,9 +564,8 @@ LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
   // transform is embedded in the payload IR. If debugTransformRootTag was
   // passed, then we are in user-specified selection of the transforming IR.
   // This corresponds to REPL debug mode.
-  bool sharedTransform = (sharedTransformModule && *sharedTransformModule);
   Operation *transformContainer =
-      sharedTransform ? sharedTransformModule->get() : target;
+      hasSharedTransformModule ? sharedTransformModule->get() : target;
   Operation *transformRoot =
       debugTransformRootTag.empty()
           ? findTopLevelTransform(transformContainer,
@@ -430,7 +586,7 @@ LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
   // Copy external defintions for symbols if provided. Be aware of potential
   // concurrent execution (normally, the error shouldn't be triggered unless the
   // transform IR modifies itself in a pass, which is also forbidden elsewhere).
-  if (!sharedTransform && libraryModule && *libraryModule) {
+  if (hasTransformLibraryModule) {
     if (!target->isProperAncestor(transformRoot)) {
       InFlightDiagnostic diag =
           transformRoot->emitError()
@@ -438,8 +594,9 @@ LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
       diag.attachNote(target->getLoc()) << "pass anchor op";
       return diag;
     }
-    if (failed(defineDeclaredSymbols(*transformRoot->getBlock(),
-                                     libraryModule->get())))
+    if (failed(
+            mergeSymbolsInto(SymbolTable::getNearestSymbolTable(transformRoot),
+                             transformLibraryModule->get()->clone())))
       return failure();
   }
 
@@ -461,25 +618,27 @@ LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
 LogicalResult transform::detail::interpreterBaseInitializeImpl(
     MLIRContext *context, StringRef transformFileName,
     StringRef transformLibraryFileName,
-    std::shared_ptr<OwningOpRef<ModuleOp>> &module,
-    std::shared_ptr<OwningOpRef<ModuleOp>> &libraryModule,
+    std::shared_ptr<OwningOpRef<ModuleOp>> &sharedTransformModule,
+    std::shared_ptr<OwningOpRef<ModuleOp>> &transformLibraryModule,
     function_ref<std::optional<LogicalResult>(OpBuilder &, Location)>
         moduleBuilder) {
-  OwningOpRef<ModuleOp> parsed;
-  if (failed(parseTransformModuleFromFile(context, transformFileName, parsed)))
+  OwningOpRef<ModuleOp> parsedTransformModule;
+  if (failed(parseTransformModuleFromFile(context, transformFileName,
+                                          parsedTransformModule)))
     return failure();
-  if (parsed && failed(mlir::verify(*parsed)))
+  if (parsedTransformModule && failed(mlir::verify(*parsedTransformModule)))
     return failure();
 
-  OwningOpRef<ModuleOp> parsedLibrary;
+  OwningOpRef<ModuleOp> parsedLibraryModule;
   if (failed(parseTransformModuleFromFile(context, transformLibraryFileName,
-                                          parsedLibrary)))
+                                          parsedLibraryModule)))
     return failure();
-  if (parsedLibrary && failed(mlir::verify(*parsedLibrary)))
+  if (parsedLibraryModule && failed(mlir::verify(*parsedLibraryModule)))
     return failure();
 
-  if (parsed) {
-    module = std::make_shared<OwningOpRef<ModuleOp>>(std::move(parsed));
+  if (parsedTransformModule) {
+    sharedTransformModule = std::make_shared<OwningOpRef<ModuleOp>>(
+        std::move(parsedTransformModule));
   } else if (moduleBuilder) {
     // TODO: better location story.
     auto location = UnknownLoc::get(context);
@@ -491,20 +650,20 @@ LogicalResult transform::detail::interpreterBaseInitializeImpl(
     if (std::optional<LogicalResult> result = moduleBuilder(b, location)) {
       if (failed(*result))
         return failure();
-      module = std::move(localModule);
+      sharedTransformModule = std::move(localModule);
     }
   }
 
-  if (!parsedLibrary || !*parsedLibrary)
+  if (!parsedLibraryModule || !*parsedLibraryModule)
     return success();
 
-  if (module && *module) {
-    if (failed(defineDeclaredSymbols(*module->get().getBody(),
-                                     parsedLibrary.get())))
+  if (sharedTransformModule && *sharedTransformModule) {
+    if (failed(mergeSymbolsInto(sharedTransformModule->get(),
+                                std::move(parsedLibraryModule))))
       return failure();
   } else {
-    libraryModule =
-        std::make_shared<OwningOpRef<ModuleOp>>(std::move(parsedLibrary));
+    transformLibraryModule =
+        std::make_shared<OwningOpRef<ModuleOp>>(std::move(parsedLibraryModule));
   }
   return success();
 }
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 77076090f354e6..16f9d7e4d57eef 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -912,6 +912,15 @@ struct ReorderElementwiseOpsOnBroadcast final
       return failure();
     if (!OpTrait::hasElementwiseMappableTraits(op))
       return failure();
+    if (op->getNumOperands() == 0 ||
+        op->getResults()[0].getType() != op->getOperand(0).getType()) {
+      return failure();
+    }
+    // Avoid operations that only accept vector types, since broadcast
+    // source might be scalar types.
+    if (isa<vector::FMAOp>(op)) {
+      return failure();
+    }
 
     // Get the type of the lhs operand
     auto *lhsBcastOrSplat = op->getOperand(0).getDefiningOp();
@@ -1447,8 +1456,8 @@ void mlir::vector::
 
 void mlir::vector::populateSinkVectorBroadcastPatterns(
     RewritePatternSet &patterns, PatternBenefit benefit) {
-  patterns.add<ReorderElementwiseOpsOnBroadcast>(patterns.getContext(),
-                                                 benefit);
+  patterns.add<ReorderCastOpsOnBroadcast, ReorderElementwiseOpsOnBroadcast>(
+      patterns.getContext(), benefit);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/File.cpp b/mlir/lib/ExecutionEngine/SparseTensor/File.cpp
index bb227bf2c2dd43..c49ec0998fb444 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/File.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensor/File.cpp
@@ -1,4 +1,4 @@
-//===- File.cpp - Parsing sparse tensors from files -----------------------===//
+//===- File.cpp - Reading/writing sparse tensors from/to files ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,20 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements parsing and printing of files in one of the
-// following external formats:
-//
-// (1) Matrix Market Exchange (MME): *.mtx
-//     https://math.nist.gov/MatrixMarket/formats.html
-//
-// (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns
-//     http://frostt.io/tensors/file-formats.html
-//
-// This file is part of the lightweight runtime support library for sparse
-// tensor manipulations.  The functionality of the support library is meant
-// to simplify benchmarking, testing, and debugging MLIR code operating on
-// sparse tensors.  However, the provided functionality is **not** part of
-// core MLIR itself.
+// This file implements reading and writing sparse tensor files.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp b/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp
index c6fd669ad513f4..d3c3951c15468d 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp
@@ -8,19 +8,12 @@
 //
 // This file contains method definitions for `SparseTensorNNZ`.
 //
-// This file is part of the lightweight runtime support library for sparse
-// tensor manipulations.  The functionality of the support library is meant
-// to simplify benchmarking, testing, and debugging MLIR code operating on
-// sparse tensors.  However, the provided functionality is **not** part of
-// core MLIR itself.
-//
 //===----------------------------------------------------------------------===//
 
 #include "mlir/ExecutionEngine/SparseTensor/Storage.h"
 
 using namespace mlir::sparse_tensor;
 
-//===----------------------------------------------------------------------===//
 SparseTensorNNZ::SparseTensorNNZ(const std::vector<uint64_t> &lvlSizes,
                                  const std::vector<DimLevelType> &lvlTypes)
     : lvlSizes(lvlSizes), lvlTypes(lvlTypes), nnz(getLvlRank()) {
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
index 2c4f0123ed4417..77861e074e9333 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
@@ -9,14 +9,7 @@
 // This file contains method definitions for `SparseTensorStorageBase`.
 // In particular we want to ensure that the default implementations of
 // the "partial method specialization" trick aren't inline (since there's
-// no benefit).  Though this also helps ensure that we avoid weak-vtables:
-// <https://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers>
-//
-// This file is part of the lightweight runtime support library for sparse
-// tensor manipulations.  The functionality of the support library is meant
-// to simplify benchmarking, testing, and debugging MLIR code operating on
-// sparse tensors.  However, the provided functionality is **not** part of
-// core MLIR itself.
+// no benefit).
 //
 //===----------------------------------------------------------------------===//
 
@@ -32,10 +25,6 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT
       lvlSizes(lvlSizes, lvlSizes + lvlRank),
       lvlTypes(lvlTypes, lvlTypes + lvlRank),
       lvl2dim(lvl2dim, lvl2dim + lvlRank) {
-  // TODO: If we do get any nullptrs, I'm pretty sure these assertions
-  // will run too late (i.e., after copying things into vectors above).
-  // But since those fields are const I'm not sure there's any clean way
-  // to assert things before copying...
   assert(dimSizes && "Got nullptr for dimension sizes");
   assert(lvlSizes && "Got nullptr for level sizes");
   assert(lvlTypes && "Got nullptr for level types");
@@ -44,18 +33,15 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT
   assert(dimRank > 0 && "Trivial shape is unsupported");
   for (uint64_t d = 0; d < dimRank; ++d)
     assert(dimSizes[d] > 0 && "Dimension size zero has trivial storage");
-  // Validate level-indexed parameters.
+  // Validate lvl-indexed parameters.
   assert(lvlRank > 0 && "Trivial shape is unsupported");
   for (uint64_t l = 0; l < lvlRank; ++l) {
     assert(lvlSizes[l] > 0 && "Level size zero has trivial storage");
-    const auto dlt = lvlTypes[l]; // Avoid redundant bounds checking.
-    // We use `MLIR_SPARSETENSOR_FATAL` here instead of `assert` so that
-    // when this ctor is successful then all the methods can rely on the
-    // fact that each level-type satisfies one of these options (even
-    // when `NDEBUG` is true), thereby reducing the need to re-assert things.
-    if (!(isDenseDLT(dlt) || isCompressedDLT(dlt) || isSingletonDLT(dlt)))
+    const auto dlt = lvlTypes[l];
+    if (!(isDenseDLT(dlt) || isCompressedDLT(dlt) || isSingletonDLT(dlt))) {
       MLIR_SPARSETENSOR_FATAL("unsupported level type: %d\n",
                               static_cast<uint8_t>(dlt));
+    }
   }
 }
 
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index d8f2854f4efe9d..82e1e96229b79e 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -2413,8 +2413,7 @@ void AsmPrinter::Impl::printDenseIntOrFPElementsAttr(
   if (!attr.isSplat() && allowHex &&
       shouldPrintElementsAttrWithHex(numElements)) {
     ArrayRef<char> rawData = attr.getRawData();
-    if (llvm::support::endian::system_endianness() ==
-        llvm::support::endianness::big) {
+    if (llvm::endianness::native == llvm::endianness::big) {
       // Convert endianess in big-endian(BE) machines. `rawData` is BE in BE
       // machines. It is converted here to print in LE format.
       SmallVector<char, 64> outDataVec(rawData.size());
diff --git a/mlir/lib/IR/BuiltinAttributes.cpp b/mlir/lib/IR/BuiltinAttributes.cpp
index 5f1129326f4f77..64949a17107297 100644
--- a/mlir/lib/IR/BuiltinAttributes.cpp
+++ b/mlir/lib/IR/BuiltinAttributes.cpp
@@ -467,8 +467,7 @@ static bool getBit(const char *rawData, size_t bitPos) {
 /// BE format.
 static void copyAPIntToArrayForBEmachine(APInt value, size_t numBytes,
                                          char *result) {
-  assert(llvm::support::endian::system_endianness() == // NOLINT
-         llvm::support::endianness::big);              // NOLINT
+  assert(llvm::endianness::native == llvm::endianness::big);
   assert(value.getNumWords() * APInt::APINT_WORD_SIZE >= numBytes);
 
   // Copy the words filled with data.
@@ -497,8 +496,7 @@ static void copyAPIntToArrayForBEmachine(APInt value, size_t numBytes,
 /// format.
 static void copyArrayToAPIntForBEmachine(const char *inArray, size_t numBytes,
                                          APInt &result) {
-  assert(llvm::support::endian::system_endianness() == // NOLINT
-         llvm::support::endianness::big);              // NOLINT
+  assert(llvm::endianness::native == llvm::endianness::big);
   assert(result.getNumWords() * APInt::APINT_WORD_SIZE >= numBytes);
 
   // Copy the data that fills the word of `result` from `inArray`.
@@ -539,8 +537,7 @@ static void writeBits(char *rawData, size_t bitPos, APInt value) {
 
   // Otherwise, the bit position is guaranteed to be byte aligned.
   assert((bitPos % CHAR_BIT) == 0 && "expected bitPos to be 8-bit aligned");
-  if (llvm::support::endian::system_endianness() ==
-      llvm::support::endianness::big) {
+  if (llvm::endianness::native == llvm::endianness::big) {
     // Copy from `value` to `rawData + (bitPos / CHAR_BIT)`.
     // Copying the first `llvm::divideCeil(bitWidth, CHAR_BIT)` bytes doesn't
     // work correctly in BE format.
@@ -565,8 +562,7 @@ static APInt readBits(const char *rawData, size_t bitPos, size_t bitWidth) {
   // Otherwise, the bit position must be 8-bit aligned.
   assert((bitPos % CHAR_BIT) == 0 && "expected bitPos to be 8-bit aligned");
   APInt result(bitWidth, 0);
-  if (llvm::support::endian::system_endianness() ==
-      llvm::support::endianness::big) {
+  if (llvm::endianness::native == llvm::endianness::big) {
     // Copy from `rawData + (bitPos / CHAR_BIT)` to `result`.
     // Copying the first `llvm::divideCeil(bitWidth, CHAR_BIT)` bytes doesn't
     // work correctly in BE format.
@@ -1367,8 +1363,7 @@ void DenseIntOrFPElementsAttr::convertEndianOfCharForBEmachine(
   using llvm::support::ulittle32_t;
   using llvm::support::ulittle64_t;
 
-  assert(llvm::support::endian::system_endianness() == // NOLINT
-         llvm::support::endianness::big);              // NOLINT
+  assert(llvm::endianness::native == llvm::endianness::big);
   // NOLINT to avoid warning message about replacing by static_assert()
 
   // Following std::copy_n always converts endianness on BE machine.
diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp
index 5aac72fcb062c3..6726b49dd3d310 100644
--- a/mlir/lib/IR/OperationSupport.cpp
+++ b/mlir/lib/IR/OperationSupport.cpp
@@ -437,6 +437,12 @@ MutableOperandRange::MutableOperandRange(
 MutableOperandRange::MutableOperandRange(Operation *owner)
     : MutableOperandRange(owner, /*start=*/0, owner->getNumOperands()) {}
 
+/// Construct a new mutable range for the given OpOperand.
+MutableOperandRange::MutableOperandRange(OpOperand &opOperand)
+    : MutableOperandRange(opOperand.getOwner(),
+                          /*start=*/opOperand.getOperandNumber(),
+                          /*length=*/1) {}
+
 /// Slice this range into a sub range, with the additional operand segment.
 MutableOperandRange
 MutableOperandRange::slice(unsigned subStart, unsigned subLen,
diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp
index 2494cb7086f0d7..727bd6b1839618 100644
--- a/mlir/lib/IR/SymbolTable.cpp
+++ b/mlir/lib/IR/SymbolTable.cpp
@@ -218,6 +218,80 @@ StringAttr SymbolTable::insert(Operation *symbol, Block::iterator insertPt) {
   return getSymbolName(symbol);
 }
 
+LogicalResult SymbolTable::rename(StringAttr from, StringAttr to) {
+  Operation *op = lookup(from);
+  return rename(op, to);
+}
+
+LogicalResult SymbolTable::rename(Operation *op, StringAttr to) {
+  StringAttr from = getNameIfSymbol(op);
+  (void)from;
+
+  assert(from && "expected valid 'name' attribute");
+  assert(op->getParentOp() == symbolTableOp &&
+         "expected this operation to be inside of the operation with this "
+         "SymbolTable");
+  assert(lookup(from) == op && "current name does not resolve to op");
+  assert(lookup(to) == nullptr && "new name already exists");
+
+  if (failed(SymbolTable::replaceAllSymbolUses(op, to, getOp())))
+    return failure();
+
+  // Remove op with old name, change name, add with new name. The order is
+  // important here due to how `remove` and `insert` rely on the op name.
+  remove(op);
+  setSymbolName(op, to);
+  insert(op);
+
+  assert(lookup(to) == op && "new name does not resolve to renamed op");
+  assert(lookup(from) == nullptr && "old name still exists");
+
+  return success();
+}
+
+LogicalResult SymbolTable::rename(StringAttr from, StringRef to) {
+  auto toAttr = StringAttr::get(getOp()->getContext(), to);
+  return rename(from, toAttr);
+}
+
+LogicalResult SymbolTable::rename(Operation *op, StringRef to) {
+  auto toAttr = StringAttr::get(getOp()->getContext(), to);
+  return rename(op, toAttr);
+}
+
+FailureOr<StringAttr>
+SymbolTable::renameToUnique(StringAttr oldName,
+                            ArrayRef<SymbolTable *> others) {
+
+  // Determine new name that is unique in all symbol tables.
+  StringAttr newName;
+  {
+    MLIRContext *context = oldName.getContext();
+    SmallString<64> prefix = oldName.getValue();
+    int uniqueId = 0;
+    prefix.push_back('_');
+    while (true) {
+      newName = StringAttr::get(context, prefix + Twine(uniqueId++));
+      auto lookupNewName = [&](SymbolTable *st) { return st->lookup(newName); };
+      if (!lookupNewName(this) && llvm::none_of(others, lookupNewName)) {
+        break;
+      }
+    }
+  }
+
+  // Apply renaming.
+  if (failed(rename(oldName, newName)))
+    return failure();
+  return newName;
+}
+
+FailureOr<StringAttr>
+SymbolTable::renameToUnique(Operation *op, ArrayRef<SymbolTable *> others) {
+  StringAttr from = getNameIfSymbol(op);
+  assert(from && "expected valid 'name' attribute");
+  return renameToUnique(from, others);
+}
+
 /// Returns the name of the given symbol operation.
 StringAttr SymbolTable::getSymbolName(Operation *symbol) {
   StringAttr name = getNameIfSymbol(symbol);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ArmSME/ArmSMEToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ArmSME/ArmSMEToLLVMIRTranslation.cpp
index 1b57b9979af28a..e6ee41188d594a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ArmSME/ArmSMEToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ArmSME/ArmSMEToLLVMIRTranslation.cpp
@@ -35,7 +35,8 @@ class ArmSMEDialectLLVMIRTranslationInterface
   convertOperation(Operation *op, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) const final {
     Operation &opInst = *op;
-#include "mlir/Dialect/ArmSME/IR/ArmSMEConversions.inc"
+#include "mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicConversions.inc"
+#include "mlir/Dialect/ArmSME/IR/ArmSMEOpsConversions.inc"
 
     return failure();
   }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8f7f1963b3e5a4..27e22c3cef430c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -661,6 +661,31 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
   return bodyGenStatus;
 }
 
+// Convert an OpenMP Teams construct to LLVM IR using OpenMPIRBuilder
+static LogicalResult
+convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
+                LLVM::ModuleTranslation &moduleTranslation) {
+  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  LogicalResult bodyGenStatus = success();
+  if (op.getNumTeamsLower() || op.getNumTeamsUpper() || op.getIfExpr() ||
+      op.getThreadLimit() || !op.getAllocatorsVars().empty() ||
+      op.getReductions()) {
+    return op.emitError("unhandled clauses for translation to LLVM IR");
+  }
+  auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
+    LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+        moduleTranslation, allocaIP);
+    builder.restoreIP(codegenIP);
+    convertOmpOpRegions(op.getRegion(), "omp.teams.region", builder,
+                        moduleTranslation, bodyGenStatus);
+  };
+
+  llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+  builder.restoreIP(
+      moduleTranslation.getOpenMPBuilder()->createTeams(ompLoc, bodyCB));
+  return bodyGenStatus;
+}
+
 /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
@@ -1989,15 +2014,6 @@ handleDeclareTargetMapVar(llvm::ArrayRef<Value> mapOperands,
           user->replaceUsesOfWith(mapOpValue, load);
         }
       }
-
-      // A global has already been generated by this stage for device
-      // that's now dead after the remapping, we can remove it now,
-      // as we have replaced all usages with the new ref pointer.
-      if (llvm::GlobalValue *unusedGlobalVal =
-              dyn_cast<llvm::GlobalValue>(mapOpValue)) {
-        unusedGlobalVal->dropAllReferences();
-        unusedGlobalVal->eraseFromParent();
-      }
     }
   }
 }
@@ -2406,6 +2422,9 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::SingleOp op) {
         return convertOmpSingle(op, builder, moduleTranslation);
       })
+      .Case([&](omp::TeamsOp op) {
+        return convertOmpTeams(op, builder, moduleTranslation);
+      })
       .Case([&](omp::TaskOp op) {
         return convertOmpTaskOp(op, builder, moduleTranslation);
       })
diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt
index 9368cb4c2f1657..088d9a765b9773 100644
--- a/mlir/python/CMakeLists.txt
+++ b/mlir/python/CMakeLists.txt
@@ -200,6 +200,15 @@ declare_mlir_dialect_extension_python_bindings(
   DIALECT_NAME transform
   EXTENSION_NAME memref_transform)
 
+declare_mlir_dialect_extension_python_bindings(
+  ADD_TO_PARENT MLIRPythonSources.Dialects
+  ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
+  TD_FILE dialects/NVGPUTransformOps.td
+  SOURCES
+    dialects/transform/nvgpu.py
+  DIALECT_NAME transform
+  EXTENSION_NAME nvgpu_transform)
+
 declare_mlir_dialect_extension_python_bindings(
   ADD_TO_PARENT MLIRPythonSources.Dialects
   ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
diff --git a/mlir/python/mlir/dialects/NVGPUTransformOps.td b/mlir/python/mlir/dialects/NVGPUTransformOps.td
new file mode 100644
index 00000000000000..1f504e3220e92d
--- /dev/null
+++ b/mlir/python/mlir/dialects/NVGPUTransformOps.td
@@ -0,0 +1,20 @@
+//===-- NVGPUTransformOps.td -------------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Entry point of the Python bindings generator for the transform ops provided
+// by the NVGPU dialect.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef PYTHON_BINDINGS_NVGPU_TRANSFORM_OPS
+#define PYTHON_BINDINGS_NVGPU_TRANSFORM_OPS
+
+include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td"
+
+#endif // PYTHON_BINDINGS_NVGPU_TRANSFORM_OPS
diff --git a/mlir/python/mlir/dialects/transform/nvgpu.py b/mlir/python/mlir/dialects/transform/nvgpu.py
new file mode 100644
index 00000000000000..74ba4c9aeb6c10
--- /dev/null
+++ b/mlir/python/mlir/dialects/transform/nvgpu.py
@@ -0,0 +1,5 @@
+#  Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#  See https://llvm.org/LICENSE.txt for license information.
+#  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from .._nvgpu_transform_ops_gen import *
diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt
index 7671d33293f483..a596f8747ebe76 100644
--- a/mlir/python/requirements.txt
+++ b/mlir/python/requirements.txt
@@ -1,4 +1,4 @@
-numpy>=1.19.5, <=1.23.5
+numpy>=1.19.5, <=1.26
 pybind11>=2.9.0, <=2.10.3
-PyYAML>= 5.3.1, <=6.0
-dataclasses>=0.6, <=0.8
+PyYAML>=5.3.1, <=6.0.1
+dataclasses>=0.6, <=0.8
\ No newline at end of file
diff --git a/mlir/test/Analysis/DataFlow/test-dead-code-analysis.mlir b/mlir/test/Analysis/DataFlow/test-dead-code-analysis.mlir
index 020c8b2a0e84dc..b1af670440c2d9 100644
--- a/mlir/test/Analysis/DataFlow/test-dead-code-analysis.mlir
+++ b/mlir/test/Analysis/DataFlow/test-dead-code-analysis.mlir
@@ -118,6 +118,18 @@ func.func @test_callgraph(%cond: i1, %arg0: i32) -> i32 {
   return %2 : i32
 }
 
+func.func private @bax(%arg0: i32) {
+  return {void_return}
+}
+
+func.func @test_callgraph_void_return(%arg0: i32) -> i32 {
+  // CHECK: call_void_return:
+  // CHECK: op_preds: (all) predecessors:
+  // CHECK:   func.return {void_return}
+  func.call @bax(%arg0) {tag = "call_void_return"}: (i32) -> ()
+  return %arg0 : i32
+}
+
 // CHECK: test_unknown_branch:
 // CHECK:  region #0
 // CHECK:   ^bb0 = live
diff --git a/mlir/test/Analysis/DataFlow/test-last-modified-callgraph.mlir b/mlir/test/Analysis/DataFlow/test-last-modified-callgraph.mlir
index 4a243571c231a1..709d787bb306be 100644
--- a/mlir/test/Analysis/DataFlow/test-last-modified-callgraph.mlir
+++ b/mlir/test/Analysis/DataFlow/test-last-modified-callgraph.mlir
@@ -68,6 +68,23 @@ func.func @test_multiple_return_sites(%cond: i1, %a: i32, %ptr: memref<i32>) ->
 
 // -----
 
+// CHECK-LABEL: test_tag: after_call
+// CHECK: operand #0
+// CHECK-NEXT: - write0
+func.func private @void_return(%ptr: memref<i32>) {
+  return
+}
+
+func.func @test_call_void_return() {
+  %ptr = memref.alloc() : memref<i32>
+  %c0 = arith.constant 0 : i32
+  memref.store %c0, %ptr[] {tag_name = "write0"} : memref<i32>
+  func.call @void_return(%ptr) : (memref<i32>) -> ()
+  memref.load %ptr[] {tag = "after_call"} : memref<i32>
+  return
+}
+
+// -----
 
 func.func private @callee(%arg0: memref<f32>) -> memref<f32> {
   %2 = arith.constant 2.0 : f32
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 8c2f8dbbd5ad9a..e54b62a06d4313 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -674,7 +674,7 @@ module @mymodule {
 !tensorMap = !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16,3>, swizzle = swizzle_128b, l2promo=none, oob=zero, interleave=none>
 memref.global "private" @dynamicShmem : memref<0xf16,3>
 // CHECK-LABEL: func @create_wgmma_descriptor(
-func.func @create_wgmma_descriptor(%tensorMap : !tensorMap) -> !nvgpu.wgmma.descriptor<tensor=memref<128x64xf16,3>>{
+func.func @create_wgmma_descriptor(%tensorMap : !tensorMap) -> !nvgpu.warpgroup.descriptor<tensor=memref<128x64xf16,3>>{
   %dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
   %lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [128,64], strides: [64,1] : memref<0xf16, 3> to memref<128x64xf16,3>
     // CHECK: %[[S0:.+]] = memref.get_global @dynamicShmem : memref<0xf16, 3>
@@ -706,22 +706,22 @@ func.func @create_wgmma_descriptor(%tensorMap : !tensorMap) -> !nvgpu.wgmma.desc
     // CHECK: %[[S25:.+]] = llvm.mlir.constant(0 : i64) : i64
     // CHECK: %[[S26:.+]] = llvm.shl %[[S7]], %[[S25]]  : i64
     // CHECK: %[[S27:.+]] = llvm.or %[[S24]], %[[S26]]  : i64
-    // CHECK: %[[ret:.+]] = builtin.unrealized_conversion_cast %[[S27]] : i64 to !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>> 
+    // CHECK: %[[ret:.+]] = builtin.unrealized_conversion_cast %[[S27]] : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>> 
     // CHECK: return %[[ret]]
-  %descA = nvgpu.wgmma.generate.descriptor %lhsShmem, %tensorMap : memref<128x64xf16,3>, !tensorMap -> !nvgpu.wgmma.descriptor<tensor=memref<128x64xf16,3>>
-  func.return %descA : !nvgpu.wgmma.descriptor<tensor=memref<128x64xf16,3>>
+  %descA = nvgpu.warpgroup.generate.descriptor %lhsShmem, %tensorMap : memref<128x64xf16,3>, !tensorMap -> !nvgpu.warpgroup.descriptor<tensor=memref<128x64xf16,3>>
+  func.return %descA : !nvgpu.warpgroup.descriptor<tensor=memref<128x64xf16,3>>
 }
 
 // CHECK-LABEL: @warpgroup_mma_128_128_64(  
-// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>, %[[arg1:[a-zA-Z0-9_]+]]: !nvgpu.wgmma.descriptor<tensor = memref<64x128xf16, 3>>, %[[arg2:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>, %[[arg3:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>)
+// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, %[[arg1:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, %[[arg2:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>, %[[arg3:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>)
 func.func @warpgroup_mma_128_128_64(
-      %descA: !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>, 
-      %descB: !nvgpu.wgmma.descriptor<tensor = memref<64x128xf16, 3>>, 
+      %descA: !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
+      %descB: !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
       %acc1: !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>,
       %acc2: !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>) 
 {
-// CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>> to i64
-// CHECK: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.wgmma.descriptor<tensor = memref<64x128xf16, 3>> to i64
+// CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>> to i64
+// CHECK: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>> to i64
 // CHECK: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>> to !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
 // CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[arg3]] : !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>> to !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
 // CHECK: nvvm.wgmma.fence.aligned
@@ -762,8 +762,8 @@ func.func @warpgroup_mma_128_128_64(
 // CHECK: nvvm.wgmma.commit.group.sync.aligned
 // CHECK: nvvm.wgmma.wait.group.sync.aligned 1  
   %wgmmaResult, %wgmmaResult2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2 {transposeB}: 
-      !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>, 
-      !nvgpu.wgmma.descriptor<tensor = memref<64x128xf16, 3>>, 
+      !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>, 
+      !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>, 
       !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>,
       !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>> 
       -> 
@@ -772,6 +772,135 @@ func.func @warpgroup_mma_128_128_64(
   return
 }
 
+// CHECK-LABEL: @warpgroup_mma_store(  
+// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>, %[[arg1:[a-zA-Z0-9_]+]]: !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>, %[[arg2:[a-zA-Z0-9_]+]]: memref<128x128xf32, 3>)
+func.func @warpgroup_mma_store(
+    %result1 : !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>,
+    %result2 : !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>, 
+    %matrixD: memref<128x128xf32,3>) {
+// CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>> to !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>> to !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: %[[S6:.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[S5:.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK: %[[S2:.+]] = llvm.mlir.constant(4 : i32) : i32
+// CHECK: %[[S4:.+]] = llvm.mlir.constant(8 : i32) : i32
+// CHECK: %[[S7:.+]] = llvm.mlir.constant(16 : i32) : i32
+// CHECK: %[[WarpSize:.+]] = llvm.mlir.constant(32 : i32) : i32
+
+// ### Store {d0, d1} of each thread ###
+
+// CHECK: %[[S8:.+]] = nvvm.read.ptx.sreg.tid.x : i32
+// CHECK: %[[S9:.+]] = llvm.urem %[[S8]], %[[WarpSize]]  : i32
+// CHECK: %[[S10:.+]] = llvm.udiv %[[S8]], %[[WarpSize]]  : i32
+// CHECK: %[[S11:.+]] = llvm.udiv %[[S9]], %[[S2]]  : i32
+// CHECK: %[[S12:.+]] = llvm.urem %[[S9]], %[[S2]]  : i32
+// CHECK: %[[S13:.+]] = llvm.mul %[[S12]], %[[S5]]  : i32
+// CHECK: %[[S14:.+]] = llvm.mul %[[S10]], %[[S7]]  : i32
+// CHECK: %[[S15:.+]] = llvm.add %[[S11]], %[[S14]]  : i32
+// CHECK: %[[S16:.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[S17:.+]] = llvm.mul %[[S16]], %[[S4]]  : i32
+// CHECK: %[[S18:.+]] = llvm.add %[[S15]], %[[S17]]  : i32
+// CHECK: %[[S19:.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[S20:.+]] = llvm.mul %[[S19]], %[[S4]]  : i32
+// CHECK: %[[S21:.+]] = llvm.add %[[S13]], %[[S20]]  : i32
+// CHECK: %[[S22:.+]] = arith.index_cast %[[S18]] : i32 to index
+// CHECK: %[[S23:.+]] = arith.index_cast %[[S21]] : i32 to index
+// CHECK: %[[S24:.+]] = llvm.add %[[S21]], %[[S6]]  : i32
+// CHECK: %[[S25:.+]] = arith.index_cast %[[S24]] : i32 to index
+// CHECK: %[[S26:.+]] = llvm.extractvalue %[[S0]][0] : !llvm.struct
+// CHECK: %[[S27:.+]] = llvm.extractvalue %[[S0]][1] : !llvm.struct
+// CHECK: memref.store %[[S26]], %[[arg2]][%[[S22]], %[[S23]]] : memref<128x128xf32, 3>
+// CHECK: memref.store %[[S27]], %[[arg2]][%[[S22]], %[[S25]]] : memref<128x128xf32, 3>
+
+// ### Store {d2, d3} of each thread ###
+
+// CHECK: %[[S28:.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[S29:.+]] = llvm.mul %[[S28]], %[[S4]]  : i32
+// CHECK: %[[S30:.+]] = llvm.add %[[S13]], %[[S29]]  : i32
+// CHECK: %[[S31:.+]] = arith.index_cast %[[S18]] : i32 to index
+// CHECK: %[[S32:.+]] = arith.index_cast %[[S30]] : i32 to index
+// CHECK: %[[S33:.+]] = llvm.add %[[S30]], %[[S6]]  : i32
+// CHECK: %[[S34:.+]] = arith.index_cast %[[S33]] : i32 to index
+// CHECK: %[[S35:.+]] = llvm.extractvalue %[[S0]][4] : !llvm.struct<
+// CHECK: %[[S36:.+]] = llvm.extractvalue %[[S0]][5] : !llvm.struct<
+// CHECK: memref.store %[[S35]], %[[arg2]][%[[S31]], %[[S32]]] : memref<128x128xf32, 3>
+// CHECK: memref.store %[[S36]], %[[arg2]][%[[S31]], %[[S34]]] : memref<128x128xf32, 3>
+
+// ### Store {d4, d5} of each thread ### 
+
+// CHECK: %[[S37:.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK: %[[S38:.+]] = llvm.mul %[[S37]], %[[S4]]  : i32
+// CHECK: %[[S39:.+]] = llvm.add %[[S13]], %[[S38]]  : i32
+// CHECK: %[[S40:.+]] = arith.index_cast %[[S18]] : i32 to index
+// CHECK: %[[S41:.+]] = arith.index_cast %[[S39]] : i32 to index
+// CHECK: %[[S42:.+]] = llvm.add %[[S39]], %[[S6]]  : i32
+// CHECK: %[[S43:.+]] = arith.index_cast %[[S42]] : i32 to index
+// CHECK: %[[S44:.+]] = llvm.extractvalue %[[S0]][8] : !llvm.struct<
+// CHECK: %[[S45:.+]] = llvm.extractvalue %[[S0]][9] : !llvm.struct<
+// CHECK: memref.store %[[S44]], %[[arg2]][%[[S40]], %[[S41]]] : memref<128x128xf32, 3>
+// CHECK: memref.store %[[S45]], %[[arg2]][%[[S40]], %[[S43]]] : memref<128x128xf32, 3>
+
+// ### Store {d6, d7} of each thread ### 
+
+// CHECK: %[[S46:.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK: %[[S47:.+]] = llvm.mul %[[S46]], %[[S4]]  : i32
+// CHECK: %[[S48:.+]] = llvm.add %[[S13]], %[[S47]]  : i32
+// CHECK: %[[S49:.+]] = arith.index_cast %[[S18]] : i32 to index
+// CHECK: %[[S50:.+]] = arith.index_cast %[[S48]] : i32 to index
+// CHECK: %[[S51:.+]] = llvm.add %[[S48]], %[[S6]]  : i32
+// CHECK: %[[S52:.+]] = arith.index_cast %[[S51]] : i32 to index
+// CHECK: %[[S53:.+]] = llvm.extractvalue %[[S0]][12] : !llvm.struct<
+// CHECK: %[[S54:.+]] = llvm.extractvalue %[[S0]][13] : !llvm.struct<
+// CHECK: memref.store %[[S53]], %[[arg2]][%[[S49]], %[[S50]]] : memref<128x128xf32, 3>
+// CHECK: memref.store %[[S54]], %[[arg2]][%[[S49]], %[[S52]]] : memref<128x128xf32, 3>
+
+// Pattern continues similarly 28x times until {... d62, d63}
+
+// CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[c2:.+]] = llvm.mlir.constant(2 : i32) : i32
+
+// ### Store {d64, d65} of each thread ### 
+
+// CHECK: %[[S315:.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[S312:.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK: %[[S311:.+]] = llvm.mlir.constant(4 : i32) : i32
+// CHECK: %[[S313:.+]] = llvm.mlir.constant(8 : i32) : i32
+// CHECK: %[[S316:.+]] = llvm.mlir.constant(16 : i32) : i32
+// CHECK: %[[WS2:.+]] = llvm.mlir.constant(32 : i32) : i32
+// CHECK: %[[S317:.+]] = nvvm.read.ptx.sreg.tid.x : i32
+// CHECK: %[[S318:.+]] = llvm.urem %[[S317]], %[[WS2]]  : i32
+// CHECK: %[[S319:.+]] = llvm.udiv %[[S317]], %[[WS2]]  : i32
+// CHECK: %[[S320:.+]] = llvm.udiv %[[S318]], %[[S311]]  : i32
+// CHECK: %[[S321:.+]] = llvm.urem %[[S318]], %[[S311]]  : i32
+// CHECK: %[[S322:.+]] = llvm.mul %[[S321]], %[[S312]]  : i32
+// CHECK: %[[S323:.+]] = llvm.mul %[[S319]], %[[S316]]  : i32
+// CHECK: %[[S324:.+]] = llvm.add %[[S320]], %[[S323]]  : i32
+// CHECK: %[[S325:.+]] = llvm.mlir.constant(64 : i32) : i32
+// CHECK: %[[S326:.+]] = llvm.add %[[S324]], %[[S325]]  : i32
+// CHECK: %[[S327:.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[S328:.+]] = llvm.mul %[[S327]], %[[S313]]  : i32
+// CHECK: %[[S329:.+]] = llvm.add %[[S326]], %[[S328]]  : i32
+// CHECK: %[[S330:.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[S331:.+]] = llvm.mul %[[S330]], %[[S313]]  : i32
+// CHECK: %[[S332:.+]] = llvm.add %[[S322]], %[[S331]]  : i32
+// CHECK: %[[S333:.+]] = arith.index_cast %[[S329]] : i32 to index
+// CHECK: %[[S334:.+]] = arith.index_cast %[[S332]] : i32 to index
+// CHECK: %[[S335:.+]] = llvm.add %[[S332]], %[[S315]]  : i32
+// CHECK: %[[S336:.+]] = arith.index_cast %[[S335]] : i32 to index
+// CHECK: %[[S337:.+]] = llvm.extractvalue %[[S1]][0] 
+// CHECK: %[[S338:.+]] = llvm.extractvalue %[[S1]][1]  
+// CHECK: memref.store %[[S337]], %[[arg2]][%[[S333]], %[[S334]]] : memref<128x128xf32, 3>
+// CHECK: memref.store %[[S338]], %[[arg2]][%[[S333]], %[[S336]]] : memref<128x128xf32, 3>
+
+// Pattern continues similarly 31x times until {... d126, d127}
+
+  nvgpu.warpgroup.mma.store [%result1, %result2], %matrixD : 
+    !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>,
+    !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>> 
+    to memref<128x128xf32,3>
+  return 
+}
+
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !transform.any_op):
   %0 = transform.structured.match ops{["func.func"]} in %arg1 
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 7ffe1ad2bb2b11..0092dca3506403 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @init_mbarrier_arrive_expect_tx
 llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"  
   nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32
   llvm.return
 }
@@ -20,14 +20,32 @@ llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount
 
 // CHECK-LABEL: @init_mbarrier_try_wait_shared
 llvm.func @init_mbarrier_try_wait_shared(%barrier : !llvm.ptr<3>, %ticks : i32, %phase : i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09.reg .pred       P1; \0A\09LAB_WAIT: \0A\09mbarrier.try_wait.parity.shared.b64 P1, [$0], $1, $2; \0A\09@P1 bra.uni DONE; \0A\09bra.uni     LAB_WAIT; \0A\09DONE: \0A\09}", "r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred       P1;
+  // CHECK-SAME: LAB_WAIT: 
+  // CHECK-SAME: mbarrier.try_wait.parity.shared.b64 P1, [$0], $1, $2;
+  // CHECK-SAME: @P1 bra.uni DONE;
+  // CHECK-SAME: bra.uni     LAB_WAIT;
+  // CHECK-SAME: DONE:
+  // CHECK-SAME: }",
+  // CHECK-SAME: "r,r,r"
    nvvm.mbarrier.try_wait.parity.shared %barrier, %phase, %ticks : !llvm.ptr<3>, i32, i32
   llvm.return
 }
 
 // CHECK-LABEL: @init_mbarrier_try_wait
 llvm.func @init_mbarrier_try_wait(%barrier : !llvm.ptr, %ticks : i32, %phase : i32){
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09.reg .pred       P1; \0A\09LAB_WAIT: \0A\09mbarrier.try_wait.parity.b64 P1, [$0], $1, $2; \0A\09@P1 bra.uni DONE; \0A\09bra.uni     LAB_WAIT; \0A\09DONE: \0A\09}", "l,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred       P1;
+  // CHECK-SAME: LAB_WAIT: 
+  // CHECK-SAME: mbarrier.try_wait.parity.b64 P1, [$0], $1, $2;
+  // CHECK-SAME: @P1 bra.uni DONE;
+  // CHECK-SAME: bra.uni     LAB_WAIT;
+  // CHECK-SAME: DONE:
+  // CHECK-SAME: }",
+  // CHECK-SAME: "l,r,r"
   nvvm.mbarrier.try_wait.parity %barrier, %phase, %ticks : !llvm.ptr, i32, i32
   llvm.return
 }
@@ -43,79 +61,93 @@ func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) {
 
 // CHECK-LABEL: @async_cp_zfill
 func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", 
+  // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
   nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", 
+  // CHECK-SAME: "r,l,n,r" %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
   nvvm.cp.async.shared.global %dst, %src, 4, cache =  ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
   return
 }
 
 // CHECK-LABEL: @tma_load_1d
 func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3} ], [$2];", "r,l,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3} ], [$2];", "r,l,r,r"
   nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32
   return
 }
 
 // CHECK-LABEL: @tma_load_2d
 func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4} ], [$2];", "r,l,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4} ], [$2];", "r,l,r,r,r"
   nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_load_3d
 func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4, $5} ], [$2];", "r,l,r,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4, $5} ], [$2];", "r,l,r,r,r,r"
   nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_load_4d
 func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4, $5, $6} ], [$2];", "r,l,r,r,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4, $5, $6} ], [$2];", "r,l,r,r,r,r,r"
   nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_load_5d
 func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4, $5, $6, $7} ], [$2];", "r,l,r,r,r,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$3, $4, $5, $6, $7} ], [$2];", "r,l,r,r,r,r,r,r"
   nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_store_1d
 func.func @tma_store_1d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
   nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0] : !llvm.ptr, !llvm.ptr<3>, i32
   return
 }
 
 // CHECK-LABEL: @tma_store_2d
 func.func @tma_store_2d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r"
   nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1] : !llvm.ptr, !llvm.ptr<3>, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_store_3d
 func.func @tma_store_3d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r"
   nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_store_4d
 func.func @tma_store_4d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32) {
-  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r"
   nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32
   return
 }
 
 // CHECK-LABEL: @tma_store_5d
 func.func @tma_store_5d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32) {
-  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r"
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r"
   nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32
   return
 }
@@ -161,7 +193,16 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{
   // CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
   // CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>  
   // CHECK: %[[V13:.*]] = llvm.extractvalue %[[RES]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[RES1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $34, 0;\0Awgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" %[[V0]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11]], %{{.*}}, %[[V13]], %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]] : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  // CHECK: %[[RES1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+  // CHECK-SAME: reg .pred p;
+  // CHECK-SAME: setp.ne.b32 p, $34, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 
+  // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", 
+  // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 
+  // CHECK-SAME: %[[V0]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11]], %{{.*}}, %[[V13]], %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]] 
+  // CHECK-SAME: : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32) 
+  // CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
   // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
   // CHECK: %[[DESCa:.+]] = llvm.add %[[ARG0]], %[[C2]] : i64
   // CHECK: %[[DESCb:.+]] = llvm.add %[[ARG1]], %[[C2]] : i64
@@ -169,7 +210,14 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{
   // CHECK: %[[V4_2:.*]] = llvm.extractvalue %[[RES1]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
   // CHECK: %[[V11_2:.*]] = llvm.extractvalue %[[RES1]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>  
   // CHECK: %[[V13_2:.*]] = llvm.extractvalue %[[RES1]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $34, 0;\0Awgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+    // CHECK-SAME: .reg .pred p;
+    // CHECK-SAME: setp.ne.b32 p, $34, 0;
+    // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 
+    // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A", 
+    // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 
+    // CHECK-SAME: %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} 
   %result = llvm.mlir.undef : !mat64f32
   %result1 = nvvm.wgmma.mma_async 
       %descA, %descB, 
@@ -205,17 +253,37 @@ func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{
 // CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
 // CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
 // CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
-// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $10, 0;\0Awgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
+// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME: "{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 
+// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 
+// CHECK-SAME: %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 
+// CHECK-SAME: (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
 // CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
 // CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
 // CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
 // CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
-// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $10, 0;\0Awgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
+// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME: "{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 
+// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", 
+// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" 
+// CHECK-SAME: %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
 // CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
 // CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
 // CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
 // CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
-// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $10, 0;\0Awgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
+// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME:"{
+// CHECK-SAME:.reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite
+// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 
+// CHECK-SAME: %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
   %result1 = nvvm.wgmma.mma_async %descA, %descB, 
       #nvvm.shape<m = 64, n = 8, k = 32>, 
       D [%result, #nvvm.wgmma_scale_out<one>, <satfinite>],
@@ -246,17 +314,36 @@ func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 {
 // CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
 // CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
 // CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
-// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $10, 0;\0Awgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
+// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME: "{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
+// CHECK-SAME: }\0A",
+// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 
+// CHECK-SAME:(i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
 // CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
 // CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
 // CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
 // CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
-// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $10, 0;\0Awgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
+// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME:"{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
+// CHECK-SAME: }\0A",
+// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
 // CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
 // CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
 // CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
 // CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
-// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $10, 0;\0Awgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
+// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 
+// CHECK-SAME:"{
+// CHECK-SAME: .reg .pred p;
+// CHECK-SAME: setp.ne.b32 p, $10, 0;
+// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
+// CHECK-SAME:}\0A", 
+// CHECK-SAME:"=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 
   %result = llvm.mlir.undef : !mat16i32
   %result1 = nvvm.wgmma.mma_async %descA, %descB, 
       #nvvm.shape<m = 64, n = 8, k = 32>, 
@@ -289,8 +376,16 @@ func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 {
 
 // CHECK-LABEL: @wgmma_f32_tf32_tf32
 func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 {  
-  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;\0Awgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;\0Awgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME:"{
+  // CHECK-SAME: .reg .pred p;
+  // CHECK-SAME: setp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{
+  // CHECK-SAME: .reg .pred p;
+  // CHECK-SAME: setp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
   %result = llvm.mlir.undef : !mat32f32
   %result1 = nvvm.wgmma.mma_async %descA, %descB, 
       #nvvm.shape<m = 64, n = 64, k = 8>, 
@@ -318,8 +413,12 @@ func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 {
 
 // CHECK-LABEL: @wgmma_f32_e4m3_e4m3
 func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {  
-  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;\0Awgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;\0Awgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
   %result = llvm.mlir.undef : !mat32f32
   %result1 = nvvm.wgmma.mma_async %descA, %descB, 
       #nvvm.shape<m = 64, n = 64, k = 32>, 
@@ -346,8 +445,12 @@ func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {
 
 // CHECK-LABEL: @wgmma_f32_e5m2_e4m3
 func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {  
-  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;\0Awgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
-  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;\0Awgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
+  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 
+  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
+  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
   %result = llvm.mlir.undef : !mat32f32
   %result1 = nvvm.wgmma.mma_async %descA, %descB, 
       #nvvm.shape<m = 64, n = 64, k = 32>, 
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index b08f4969ef5081..8e0307085f1ce2 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -447,7 +447,7 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () {
 
   // CHECK: linalg.generic
   // CHECK: arith.mulf
-  %4 = tosa.mul %0, %1 {shift = 0 : i32} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32>
+  %4 = tosa.mul %0, %1 {shift = 0 : i8} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32>
 
   // CHECK: linalg.generic
   // CHECK: arith.negf
@@ -570,7 +570,7 @@ func.func @test_simple_i16(%arg0: tensor<1xi16>) -> () {
   // CHECK: arith.extsi
   // CHECK: arith.extsi
   // CHECK: arith.muli
-  %0 = tosa.mul %arg0, %arg0 {shift = 0 : i32} : (tensor<1xi16>, tensor<1xi16>) -> tensor<1xi32>
+  %0 = tosa.mul %arg0, %arg0 {shift = 0 : i8} : (tensor<1xi16>, tensor<1xi16>) -> tensor<1xi32>
 
   return
 }
@@ -598,12 +598,12 @@ func.func @test_simple_i32(%arg0: tensor<1xi32>) -> () {
 
   // CHECK: linalg.generic
   // CHECK: arith.muli
-  %2 = tosa.mul %arg0, %arg0 {shift = 0 : i32} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
+  %2 = tosa.mul %arg0, %arg0 {shift = 0 : i8} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
 
   // CHECK: linalg.generic
   // CHECK: arith.constant 2
   // CHECK: apply_scale
-  %3 = tosa.mul %arg0, %arg0 {shift = 2 : i32} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
+  %3 = tosa.mul %arg0, %arg0 {shift = 2 : i8} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
 
   // CHECK: linalg.generic
   // CHECK: arith.divsi
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/invalid-zero-size.mlir b/mlir/test/Dialect/Affine/SuperVectorize/invalid-zero-size.mlir
new file mode 100644
index 00000000000000..396cc933e25bcf
--- /dev/null
+++ b/mlir/test/Dialect/Affine/SuperVectorize/invalid-zero-size.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s -verify-diagnostics --affine-super-vectorize=virtual-vector-size=0
+
+// expected-error@+1 {{Vectorization factor must be greater than zero}} 
+func.func @with_zero_vector_size(%arg0: memref<21x12x12xi1>) {
+  affine.for %arg1 = 0 to 84 step 4294967295 {
+  }
+  return
+}
+
diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir
index 1dc3451ed7db87..1bcb6fc4a365dd 100644
--- a/mlir/test/Dialect/Affine/invalid.mlir
+++ b/mlir/test/Dialect/Affine/invalid.mlir
@@ -297,6 +297,18 @@ func.func @affine_parallel(%arg0 : index, %arg1 : index, %arg2 : index) {
 
 // -----
 
+func.func @affine_parallel(%arg0 : index, %arg1 : index, %arg2 : index) {
+  %0 = memref.alloc() : memref<100x100xi32>
+  //  expected-error@+1 {{result type cannot match reduction attribute}}
+  %1 = affine.parallel (%i, %j) = (0, 0) to (100, 100) step (10, 10) reduce ("minimumf") -> (i32) {
+    %2 = affine.load %0[%i, %j] : memref<100x100xi32>
+    affine.yield %2 : i32
+  }
+  return
+}
+
+// -----
+
 func.func @vector_load_invalid_vector_type() {
   %0 = memref.alloc() : memref<100xf32>
   affine.for %i0 = 0 to 16 step 8 {
diff --git a/mlir/test/Transforms/loop-fusion-dependence-check.mlir b/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir
similarity index 100%
rename from mlir/test/Transforms/loop-fusion-dependence-check.mlir
rename to mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir
diff --git a/mlir/test/Transforms/loop-fusion-slice-computation.mlir b/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir
similarity index 100%
rename from mlir/test/Transforms/loop-fusion-slice-computation.mlir
rename to mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir
diff --git a/mlir/test/Transforms/loop-fusion-transformation.mlir b/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir
similarity index 100%
rename from mlir/test/Transforms/loop-fusion-transformation.mlir
rename to mlir/test/Dialect/Affine/loop-fusion-transformation.mlir
diff --git a/mlir/test/Transforms/memref-bound-check.mlir b/mlir/test/Dialect/Affine/memref-bound-check.mlir
similarity index 100%
rename from mlir/test/Transforms/memref-bound-check.mlir
rename to mlir/test/Dialect/Affine/memref-bound-check.mlir
diff --git a/mlir/test/Transforms/memref-dependence-check.mlir b/mlir/test/Dialect/Affine/memref-dependence-check.mlir
similarity index 100%
rename from mlir/test/Transforms/memref-dependence-check.mlir
rename to mlir/test/Dialect/Affine/memref-dependence-check.mlir
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 84096354e6afe3..f697f3d01458ee 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2669,3 +2669,20 @@ func.func @extsi_poison() -> i64 {
   %1 = arith.extsi %0 : i32 to i64
   return %1 : i64
 }
+
+// Just checks that this doesn't crash.
+// CHECK-LABEL: @unsignedExtendConstantResource
+func.func @unsignedExtendConstantResource() -> tensor<i16> {
+  %c2 = arith.constant dense_resource<blob1> : tensor<i8>
+  %ext = arith.extui %c2 : tensor<i8> to tensor<i16>
+  return %ext : tensor<i16>
+}
+
+{-#
+  dialect_resources: {
+    builtin: {
+      // Note: This is just copied blob, the actual value isn't used or checked.
+      blob1: "0x08000000010000000000000002000000000000000300000000000000"
+    }
+  }
+#-}
diff --git a/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir b/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir
index 687ef79385334c..32f46d9fd817c9 100644
--- a/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir
+++ b/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir
@@ -496,3 +496,384 @@ func.func @vector_outerproduct_add_masked_f32(%lhs : vector<[4]xf32>, %rhs : vec
   %0 = vector.mask %mask { vector.outerproduct %lhs, %rhs, %acc {kind = #vector.kind<add>} : vector<[4]xf32>, vector<[4]xf32> } : vector<[4]x[4]xi1> -> vector<[4]x[4]xf32>
   "prevent.dce"(%0) : (vector<[4]x[4]xf32>) -> ()
 }
+
+//===----------------------------------------------------------------------===//
+// vector.insert
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_i32(
+// CHECK-SAME:                       %[[TILE:.*]]: vector<[4]x[4]xi32>,
+// CHECK-SAME:                       %[[SLICE:.*]]: vector<[4]xi32>,
+// CHECK-SAME:                       %[[INDEX:.*]]: index)
+func.func @vector_insert_slice_i32(%tile: vector<[4]x[4]xi32>, %slice: vector<[4]xi32>, %row: index) -> vector<[4]x[4]xi32>{
+  // CHECK-NEXT: %[[PTRUE:.*]] = arith.constant dense<true> : vector<[4]xi1>
+  // CHECK-NEXT: %[[TILE_ID:.*]] = arm_sme.cast_vector_to_tile %[[TILE]] : vector<[4]x[4]xi32> to i32
+  // CHECK-NEXT: %[[TILE_SLICE_INDEX:.*]] = arith.index_castui %[[INDEX]] : index to i32
+  // CHECK-NEXT: "arm_sme.intr.write.horiz"(%[[TILE_ID]], %[[TILE_SLICE_INDEX]], %[[PTRUE]], %[[SLICE]]) : (i32, i32, vector<[4]xi1>, vector<[4]xi32>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[4]xi32> into vector<[4]x[4]xi32>
+  return %new_tile : vector<[4]x[4]xi32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_i8
+func.func @vector_insert_slice_i8(%tile: vector<[16]x[16]xi8>, %slice: vector<[16]xi8>, %row: index) -> vector<[16]x[16]xi8> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[16]xi1>, vector<[16]xi8>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[16]xi8> into vector<[16]x[16]xi8>
+  return %new_tile : vector<[16]x[16]xi8>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_i16
+func.func @vector_insert_slice_i16(%tile: vector<[8]x[8]xi16>, %slice: vector<[8]xi16>, %row: index) -> vector<[8]x[8]xi16> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[8]xi1>, vector<[8]xi16>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[8]xi16> into vector<[8]x[8]xi16>
+  return %new_tile : vector<[8]x[8]xi16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_i64
+func.func @vector_insert_slice_i64(%tile: vector<[2]x[2]xi64>, %slice: vector<[2]xi64>, %row: index) -> vector<[2]x[2]xi64> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[2]xi1>, vector<[2]xi64>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[2]xi64> into vector<[2]x[2]xi64>
+  return %new_tile : vector<[2]x[2]xi64>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_i128
+func.func @vector_insert_slice_i128(%tile: vector<[1]x[1]xi128>, %slice: vector<[1]xi128>, %row: index) -> vector<[1]x[1]xi128> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[1]xi1>, vector<[1]xi128>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[1]xi128> into vector<[1]x[1]xi128>
+  return %new_tile : vector<[1]x[1]xi128>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_f16
+func.func @vector_insert_slice_f16(%tile: vector<[8]x[8]xf16>, %slice: vector<[8]xf16>, %row: index) -> vector<[8]x[8]xf16> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[8]xi1>, vector<[8]xf16>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[8]xf16> into vector<[8]x[8]xf16>
+  return %new_tile : vector<[8]x[8]xf16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_bf16
+func.func @vector_insert_slice_bf16(%tile: vector<[8]x[8]xbf16>, %slice: vector<[8]xbf16>, %row: index) -> vector<[8]x[8]xbf16> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[8]xi1>, vector<[8]xbf16>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  return %new_tile : vector<[8]x[8]xbf16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_f32
+func.func @vector_insert_slice_f32(%tile: vector<[4]x[4]xf32>, %slice: vector<[4]xf32>, %row: index) -> vector<[4]x[4]xf32> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[4]xi1>, vector<[4]xf32>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  return %new_tile : vector<[4]x[4]xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_slice_f64
+func.func @vector_insert_slice_f64(%tile: vector<[2]x[2]xf64>, %slice: vector<[2]xf64>, %row: index) -> vector<[2]x[2]xf64> {
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[2]xi1>, vector<[2]xf64>) -> ()
+  %new_tile = vector.insert %slice, %tile[%row] : vector<[2]xf64> into vector<[2]x[2]xf64>
+  return %new_tile : vector<[2]x[2]xf64>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_i32(
+// CHECK-SAME:                         %[[TILE:.*]]: vector<[4]x[4]xi32>,
+// CHECK-SAME:                         %[[EL:.*]]: i32,
+// CHECK-SAME:                         %[[ROW:.*]]: index,
+// CHECK-SAME:                         %[[COL:.*]]: index)
+func.func @vector_insert_element_i32(%tile: vector<[4]x[4]xi32>, %el: i32, %row: index, %col: index) -> vector<[4]x[4]xi32> {
+  // CHECK-NEXT: %[[ZERO_VEC:.*]] = arith.constant dense<0> : vector<[4]xi32>
+  // CHECK-NEXT: %[[PTRUE:.*]] = arith.constant dense<true> : vector<[4]xi1>
+  // CHECK-NEXT: %[[COL_I32:.*]] = builtin.unrealized_conversion_cast %[[COL]] : index to i64
+  // CHECK-NEXT: %[[TILE_ID:.*]] = arm_sme.cast_vector_to_tile %[[TILE]] : vector<[4]x[4]xi32> to i32
+  // CHECK-NEXT: %[[ROW_I32:.*]] = arith.index_cast %[[ROW]] : index to i32
+  // CHECK-NEXT: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%[[ZERO_VEC]], %[[PTRUE]], %[[TILE_ID]], %[[ROW_I32]]) : (vector<[4]xi32>, vector<[4]xi1>, i32, i32) -> vector<[4]xi32>
+  // CHECK-NEXT: %[[NEW_SLICE:.*]] = llvm.insertelement %[[EL]], %[[SLICE]]{{\[}}%[[COL_I32]] : i64] : vector<[4]xi32>
+  // CHECK-NEXT: %[[SLICE_INDEX:.*]] = arith.index_castui %[[ROW]] : index to i32
+  // CHECK-NEXT: "arm_sme.intr.write.horiz"(%[[TILE_ID]], %[[SLICE_INDEX]], %[[PTRUE]], %[[NEW_SLICE]]) : (i32, i32, vector<[4]xi1>, vector<[4]xi32>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : i32 into vector<[4]x[4]xi32>
+  return %new_tile : vector<[4]x[4]xi32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_i8
+func.func @vector_insert_element_i8(%tile: vector<[16]x[16]xi8>, %el: i8, %row: index, %col: index) -> vector<[16]x[16]xi8> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[16]xi8>, vector<[16]xi1>, i32, i32) -> vector<[16]xi8>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[16]xi1>, vector<[16]xi8>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : i8 into vector<[16]x[16]xi8>
+  return %new_tile : vector<[16]x[16]xi8>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_i16
+func.func @vector_insert_element_i16(%tile: vector<[8]x[8]xi16>, %el: i16, %row: index, %col: index) -> vector<[8]x[8]xi16> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xi16>, vector<[8]xi1>, i32, i32) -> vector<[8]xi16>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[8]xi1>, vector<[8]xi16>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : i16 into vector<[8]x[8]xi16>
+  return %new_tile : vector<[8]x[8]xi16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_i64
+func.func @vector_insert_element_i64(%tile: vector<[2]x[2]xi64>, %el: i64, %row: index, %col: index) -> vector<[2]x[2]xi64> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[2]xi64>, vector<[2]xi1>, i32, i32) -> vector<[2]xi64>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[2]xi1>, vector<[2]xi64>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : i64 into vector<[2]x[2]xi64>
+  return %new_tile : vector<[2]x[2]xi64>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_i128
+func.func @vector_insert_element_i128(%tile: vector<[1]x[1]xi128>, %el: i128, %row: index, %col: index) -> vector<[1]x[1]xi128> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[1]xi128>, vector<[1]xi1>, i32, i32) -> vector<[1]xi128>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[1]xi1>, vector<[1]xi128>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : i128 into vector<[1]x[1]xi128>
+  return %new_tile : vector<[1]x[1]xi128>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_f16
+func.func @vector_insert_element_f16(%tile: vector<[8]x[8]xf16>, %el: f16, %row: index, %col: index) -> vector<[8]x[8]xf16> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xf16>, vector<[8]xi1>, i32, i32) -> vector<[8]xf16>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[8]xi1>, vector<[8]xf16>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : f16 into vector<[8]x[8]xf16>
+  return %new_tile : vector<[8]x[8]xf16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_bf16
+func.func @vector_insert_element_bf16(%tile: vector<[8]x[8]xbf16>, %el: bf16, %row: index, %col: index) -> vector<[8]x[8]xbf16> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xbf16>, vector<[8]xi1>, i32, i32) -> vector<[8]xbf16>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[8]xi1>, vector<[8]xbf16>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : bf16 into vector<[8]x[8]xbf16>
+  return %new_tile : vector<[8]x[8]xbf16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_f32
+func.func @vector_insert_element_f32(%tile: vector<[4]x[4]xf32>, %el: f32, %row: index, %col: index) -> vector<[4]x[4]xf32> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[4]xf32>, vector<[4]xi1>, i32, i32) -> vector<[4]xf32>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[4]xi1>, vector<[4]xf32>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : f32 into vector<[4]x[4]xf32>
+  return %new_tile : vector<[4]x[4]xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_insert_element_f64
+func.func @vector_insert_element_f64(%tile: vector<[2]x[2]xf64>, %el: f64, %row: index, %col: index) -> vector<[2]x[2]xf64> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[2]xf64>, vector<[2]xi1>, i32, i32) -> vector<[2]xf64>
+  // CHECK: "arm_sme.intr.write.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (i32, i32, vector<[2]xi1>, vector<[2]xf64>) -> ()
+  %new_tile = vector.insert %el, %tile[%row, %col] : f64 into vector<[2]x[2]xf64>
+  return %new_tile : vector<[2]x[2]xf64>
+}
+
+//===----------------------------------------------------------------------===//
+// vector.extract
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_i32(
+// CHECK-SAME:                        %[[TILE:.*]]: vector<[4]x[4]xi32>,
+// CHECK-SAME:                        %[[INDEX:.*]]: index)
+func.func @vector_extract_slice_i32(%tile: vector<[4]x[4]xi32>, %row: index) -> vector<[4]xi32> {
+  // CHECK-NEXT: %[[ZERO_VEC:.*]] = arith.constant dense<0> : vector<[4]xi32>
+  // CHECK-NEXT: %[[PTRUE:.*]] = arith.constant dense<true> : vector<[4]xi1>
+  // CHECK-NEXT: %[[TILE_ID:.*]] = arm_sme.cast_vector_to_tile %[[TILE]] : vector<[4]x[4]xi32> to i32
+  // CHECK-NEXT: %[[TILE_SLICE_INDEX:.*]] = arith.index_cast %[[INDEX]] : index to i32
+  // CHECK-NEXT: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%[[ZERO_VEC]], %[[PTRUE]], %[[TILE_ID]], %[[TILE_SLICE_INDEX]]) : (vector<[4]xi32>, vector<[4]xi1>, i32, i32) -> vector<[4]xi32>
+  %slice = vector.extract %tile[%row] : vector<[4]xi32> from vector<[4]x[4]xi32>
+  return %slice : vector<[4]xi32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_i8
+func.func @vector_extract_slice_i8(%tile: vector<[16]x[16]xi8>, %row: index) -> vector<[16]xi8> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[16]xi8>, vector<[16]xi1>, i32, i32) -> vector<[16]xi8>
+  %slice = vector.extract %tile[%row] : vector<[16]xi8> from vector<[16]x[16]xi8>
+  return %slice : vector<[16]xi8>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_i16
+func.func @vector_extract_slice_i16(%tile: vector<[8]x[8]xi16>, %row: index) -> vector<[8]xi16> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xi16>, vector<[8]xi1>, i32, i32) -> vector<[8]xi16>
+  %slice = vector.extract %tile[%row] : vector<[8]xi16> from vector<[8]x[8]xi16>
+  return %slice : vector<[8]xi16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_i64
+func.func @vector_extract_slice_i64(%tile: vector<[2]x[2]xi64>, %row: index) -> vector<[2]xi64> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[2]xi64>, vector<[2]xi1>, i32, i32) -> vector<[2]xi64>
+  %slice = vector.extract %tile[%row] : vector<[2]xi64> from vector<[2]x[2]xi64>
+  return %slice : vector<[2]xi64>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_i128
+func.func @vector_extract_slice_i128(%tile: vector<[1]x[1]xi128>, %row: index) -> vector<[1]xi128> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[1]xi128>, vector<[1]xi1>, i32, i32) -> vector<[1]xi128>
+  %slice = vector.extract %tile[%row] : vector<[1]xi128> from vector<[1]x[1]xi128>
+  return %slice : vector<[1]xi128>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_f16
+func.func @vector_extract_slice_f16(%tile: vector<[8]x[8]xf16>, %row: index) -> vector<[8]xf16> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xf16>, vector<[8]xi1>, i32, i32) -> vector<[8]xf16>
+  %slice = vector.extract %tile[%row] : vector<[8]xf16> from vector<[8]x[8]xf16>
+  return %slice : vector<[8]xf16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_bf16
+func.func @vector_extract_slice_bf16(%tile: vector<[8]x[8]xbf16>, %row: index) -> vector<[8]xbf16> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xbf16>, vector<[8]xi1>, i32, i32) -> vector<[8]xbf16>
+  %slice = vector.extract %tile[%row] : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+  return %slice : vector<[8]xbf16>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_f32
+func.func @vector_extract_slice_f32(%tile: vector<[4]x[4]xf32>, %row: index) -> vector<[4]xf32> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[4]xf32>, vector<[4]xi1>, i32, i32) -> vector<[4]xf32>
+  %slice = vector.extract %tile[%row] : vector<[4]xf32> from vector<[4]x[4]xf32>
+  return %slice : vector<[4]xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_slice_f64
+func.func @vector_extract_slice_f64(%tile: vector<[2]x[2]xf64>, %row: index) -> vector<[2]xf64> {
+  // CHECK: %{{.*}} = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[2]xf64>, vector<[2]xi1>, i32, i32) -> vector<[2]xf64>
+  %slice = vector.extract %tile[%row] : vector<[2]xf64> from vector<[2]x[2]xf64>
+  return %slice : vector<[2]xf64>
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element(
+// CHECK-SAME:                          %[[TILE:.*]]: vector<[4]x[4]xi32>,
+// CHECK-SAME:                          %[[ROW:.*]]: index,
+// CHECK-SAME:                          %[[COL:.*]]: index)
+func.func @vector_extract_element(%tile: vector<[4]x[4]xi32>, %row: index, %col: index) -> i32 {
+  // CHECK-NEXT: %[[ZERO_VEC:.*]] = arith.constant dense<0> : vector<[4]xi32>
+  // CHECK-NEXT: %[[PTRUE:.*]] = arith.constant dense<true> : vector<[4]xi1>
+  // CHECK-NEXT: %[[COL_I32:.*]] = builtin.unrealized_conversion_cast %[[COL]] : index to i64
+  // CHECK-NEXT: %[[TILE_ID:.*]] = arm_sme.cast_vector_to_tile %[[TILE]] : vector<[4]x[4]xi32> to i32
+  // CHECK-NEXT: %[[ROW_I32:.*]] = arith.index_cast %[[ROW]] : index to i32
+  // CHECK-NEXT: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%[[ZERO_VEC]], %[[PTRUE]], %[[TILE_ID]], %[[ROW_I32]]) : (vector<[4]xi32>, vector<[4]xi1>, i32, i32) -> vector<[4]xi32>
+  // CHECK-NEXT: %[[EL:.*]] = llvm.extractelement %[[SLICE]]{{\[}}%[[COL_I32]] : i64] : vector<[4]xi32>
+  %el = vector.extract %tile[%row, %col] : i32 from vector<[4]x[4]xi32>
+  return %el : i32
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_i8
+func.func @vector_extract_element_i8(%tile: vector<[16]x[16]xi8>, %row: index, %col: index) -> i8 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[16]xi8>, vector<[16]xi1>, i32, i32) -> vector<[16]xi8>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[16]xi8>
+  %el = vector.extract %tile[%row, %col] : i8 from vector<[16]x[16]xi8>
+  return %el : i8
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_i16
+func.func @vector_extract_element_i16(%tile: vector<[8]x[8]xi16>, %row: index, %col: index) -> i16 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xi16>, vector<[8]xi1>, i32, i32) -> vector<[8]xi16>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[8]xi16>
+  %el = vector.extract %tile[%row, %col] : i16 from vector<[8]x[8]xi16>
+  return %el : i16
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_i64
+func.func @vector_extract_element_i64(%tile: vector<[2]x[2]xi64>, %row: index, %col: index) -> i64 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[2]xi64>, vector<[2]xi1>, i32, i32) -> vector<[2]xi64>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[2]xi64>
+  %el = vector.extract %tile[%row, %col] : i64 from vector<[2]x[2]xi64>
+  return %el : i64
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_i128
+func.func @vector_extract_element_i128(%tile: vector<[1]x[1]xi128>, %row: index, %col: index) -> i128 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[1]xi128>, vector<[1]xi1>, i32, i32) -> vector<[1]xi128>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[1]xi128>
+  %el = vector.extract %tile[%row, %col] : i128 from vector<[1]x[1]xi128>
+  return %el : i128
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_f16
+func.func @vector_extract_element_f16(%tile: vector<[8]x[8]xf16>, %row: index, %col: index) -> f16 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xf16>, vector<[8]xi1>, i32, i32) -> vector<[8]xf16>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[8]xf16>
+  %el = vector.extract %tile[%row, %col] : f16 from vector<[8]x[8]xf16>
+  return %el : f16
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_bf16
+func.func @vector_extract_element_bf16(%tile: vector<[8]x[8]xbf16>, %row: index, %col: index) -> bf16 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[8]xbf16>, vector<[8]xi1>, i32, i32) -> vector<[8]xbf16>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[8]xbf16>
+  %el = vector.extract %tile[%row, %col] : bf16 from vector<[8]x[8]xbf16>
+  return %el : bf16
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_f32
+func.func @vector_extract_element_f32(%tile: vector<[4]x[4]xf32>, %row: index, %col: index) -> f32 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[4]xf32>, vector<[4]xi1>, i32, i32) -> vector<[4]xf32>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[4]xf32>
+  %el = vector.extract %tile[%row, %col] : f32 from vector<[4]x[4]xf32>
+  return %el : f32
+}
+
+// -----
+
+// CHECK-LABEL: @vector_extract_element_f64
+func.func @vector_extract_element_f64(%tile: vector<[2]x[2]xf64>, %row: index, %col: index) -> f64 {
+  // CHECK: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<[2]xf64>, vector<[2]xi1>, i32, i32) -> vector<[2]xf64>
+  // CHECK-NEXT: %{{.*}} = llvm.extractelement %[[SLICE]]{{\[}}%{{.*}} : i64] : vector<[2]xf64>
+  %el = vector.extract %tile[%row, %col] : f64 from vector<[2]x[2]xf64>
+  return %el : f64
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir
index dc372749fc074b..ad7c4c783e907f 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir
@@ -270,7 +270,8 @@ func.func @loop_alloc(
 //       CHECK: [[V0:%.+]]:2 = scf.for {{.*}} iter_args([[ARG6:%.+]] = [[ARG3]], [[ARG7:%.+]] = %false
 //       CHECK:   [[ALLOC1:%.+]] = memref.alloc()
 //       CHECK:   [[BASE:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[ARG6]]
-//       CHECK:   bufferization.dealloc ([[BASE]] :{{.*}}) if ([[ARG7]]) retain ([[ALLOC1]] :
+//       CHECK:   bufferization.dealloc ([[BASE]] :{{.*}}) if ([[ARG7]])
+//   CHECK-NOT:       retain
 //       CHECK:   scf.yield [[ALLOC1]], %true
 //       CHECK: test.copy
 //       CHECK: [[BASE:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[V0]]#0
@@ -563,8 +564,8 @@ func.func @while_two_arg(%arg0: index) {
 //       CHECK: ^bb0([[ARG1:%.+]]: memref<?xf32>, [[ARG2:%.+]]: memref<?xf32>, [[ARG3:%.+]]: i1, [[ARG4:%.+]]: i1):
 //       CHECK:   [[ALLOC1:%.+]] = memref.alloc(
 //       CHECK:   [[BASE:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[ARG2]]
-//       CHECK:   [[OWN:%.+]]:2 = bufferization.dealloc ([[BASE]] :{{.*}}) if ([[ARG4]]) retain ([[ARG1]], [[ALLOC1]] :
-//       CHECK:   [[OWN_AGG:%.+]] = arith.ori [[OWN]]#0, [[ARG3]]
+//       CHECK:   [[OWN:%.+]] = bufferization.dealloc ([[BASE]] :{{.*}}) if ([[ARG4]]) retain ([[ARG1]] :
+//       CHECK:   [[OWN_AGG:%.+]] = arith.ori [[OWN]], [[ARG3]]
 //       CHECK:   scf.yield [[ARG1]], [[ALLOC1]], [[OWN_AGG]], %true
 //       CHECK: [[BASE0:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[V0]]#0
 //       CHECK: [[BASE1:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[V0]]#1
@@ -594,10 +595,10 @@ func.func @while_three_arg(%arg0: index) {
 //       CHECK:   [[ALLOC1:%.+]] = memref.alloc(
 //       CHECK:   [[ALLOC2:%.+]] = memref.alloc(
 //       CHECK:   [[BASE0:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[ARG1]]
-//       CHECK:   [[BASE1:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[ARG2]]
 //       CHECK:   [[BASE2:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[ARG3]]
-//       CHECK:   [[OWN:%.+]]:3 = bufferization.dealloc ([[BASE0]], [[BASE1]], [[BASE2]], [[ALLOC1]] :{{.*}}) if ([[ARG4]], [[ARG5]], [[ARG6]], %true{{[0-9_]*}}) retain ([[ALLOC2]], [[ALLOC1]], [[ARG2]] :
-//       CHECK:   scf.yield [[ALLOC2]], [[ALLOC1]], [[ARG2]], %true{{[0-9_]*}}, %true{{[0-9_]*}}, [[OWN]]#2 :
+//       CHECK:   [[OWN:%.+]] = bufferization.dealloc ([[BASE0]], [[BASE2]] :{{.*}}) if ([[ARG4]], [[ARG6]]) retain ([[ARG2]] :
+//       CHECK:   [[OWN_AGG:%.+]] = arith.ori [[OWN]], [[ARG5]]
+//       CHECK:   scf.yield [[ALLOC2]], [[ALLOC1]], [[ARG2]], %true{{[0-9_]*}}, %true{{[0-9_]*}}, [[OWN_AGG]] :
 //       CHECK: }
 //       CHECK: [[BASE0:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[V0]]#0
 //       CHECK: [[BASE1:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[V0]]#1
diff --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir
index 98eb038df30a3b..e192e9870becdb 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir
@@ -133,3 +133,30 @@ func.func @dealloc_remove_dealloc_memref_contained_in_retained_with_const_true_c
 //       CHECK:   [[BASE:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[ARG2]]
 //       CHECK:   bufferization.dealloc ([[BASE]] :{{.*}}) if (%true{{[0-9_]*}})
 //  CHECK-NEXT:   return [[ARG0]], [[ARG1]], %true{{[0-9_]*}}, %true{{[0-9_]*}} :
+
+// -----
+
+func.func @alloc_and_bbarg(%arg0: memref<5xf32>, %arg1: index, %arg2: index, %arg3: index) -> f32 {
+  %true = arith.constant true
+  %false = arith.constant false
+  %0:2 = scf.for %arg4 = %arg1 to %arg2 step %arg3 iter_args(%arg5 = %arg0, %arg6 = %false) -> (memref<5xf32>, i1) {
+    %alloc = memref.alloc() : memref<5xf32>
+    memref.copy %arg5, %alloc : memref<5xf32> to memref<5xf32>
+    %base_buffer_0, %offset_1, %sizes_2, %strides_3 = memref.extract_strided_metadata %arg5 : memref<5xf32> -> memref<f32>, index, index, index
+    %2 = bufferization.dealloc (%base_buffer_0, %alloc : memref<f32>, memref<5xf32>) if (%arg6, %true) retain (%alloc : memref<5xf32>)
+    scf.yield %alloc, %2 : memref<5xf32>, i1
+  }
+  %1 = memref.load %0#0[%arg1] : memref<5xf32>
+  %base_buffer, %offset, %sizes, %strides = memref.extract_strided_metadata %0#0 : memref<5xf32> -> memref<f32>, index, index, index
+  bufferization.dealloc (%base_buffer : memref<f32>) if (%0#1)
+  return %1 : f32
+}
+
+// CHECK-LABEL: func @alloc_and_bbarg
+//       CHECK:   %[[true:.*]] = arith.constant true
+//       CHECK:   scf.for {{.*}} iter_args(%[[iter:.*]] = %{{.*}}, %{{.*}} = %{{.*}})
+//       CHECK:     %[[alloc:.*]] = memref.alloc
+//       CHECK:     %[[view:.*]], %{{.*}}, %{{.*}}, %{{.*}} =  memref.extract_strided_metadata %[[iter]]
+//       CHECK:     bufferization.dealloc (%[[view]] : memref<f32>)
+//   CHECK-NOT:     retain
+//       CHECK:     scf.yield %[[alloc]], %[[true]]
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
index 41e43047657daf..b68682a459ed2c 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -317,3 +317,14 @@ func.func @linalg_copy(%t: tensor<5xf32>, %f: f32) -> tensor<5xf32> {
   %1 = linalg.copy ins(%filled : tensor<5xf32>) outs(%t : tensor<5xf32>) -> tensor<5xf32>
   return %1 : tensor<5xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @linalg_copy_empty(
+// CHECK: %[[ret:.*]] = memref.alloc()
+// CHECK-NEXT: return %[[ret]]
+func.func @linalg_copy_empty() -> tensor<26xi32> {
+  %0 = tensor.empty() : tensor<26xi32>
+  %1 = linalg.copy ins(%0 : tensor<26xi32>) outs(%0 : tensor<26xi32>) -> tensor<26xi32>
+  return %1 : tensor<26xi32>
+}
diff --git a/mlir/test/Dialect/Bufferization/invalid.mlir b/mlir/test/Dialect/Bufferization/invalid.mlir
index 3dfd1eb17e8d64..8004ec632453e8 100644
--- a/mlir/test/Dialect/Bufferization/invalid.mlir
+++ b/mlir/test/Dialect/Bufferization/invalid.mlir
@@ -87,3 +87,11 @@ func.func @invalid_dealloc_wrong_number_of_results(%arg0: memref<2xf32>, %arg1:
   %0:2 = bufferization.dealloc (%arg0, %arg1 : memref<2xf32>, memref<4xi32>) if (%arg2, %arg2) retain (%arg1 : memref<4xi32>)
   return %0#0 : i1
 }
+
+// -----
+
+func.func @invalid_dealloc_wrong_number_of_results(%arg0: memref<2xf32>, %arg1: memref<4xi32>, %arg2: i1) -> i1 {
+  // expected-error @below{{must have the same number of updated conditions (results) as retained operands}}
+  %0:3 = "bufferization.dealloc"(%arg0, %arg1, %arg2, %arg2, %arg1) <{operandSegmentSizes = array<i32: 2, 2, 1>}> : (memref<2xf32>, memref<4xi32>, i1, i1, memref<4xi32>) -> (i1, i1, i1)
+  return %0#0 : i1
+}
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
index 66d7fda2230ce8..c69701b65e2064 100644
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
@@ -160,8 +160,7 @@ func.func @matmul(
   %c16 = arith.constant 16 : index
 
   // Hoisted alloc.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<128x192xf32>
-  // CHECK: memref.copy %[[C]], %[[ALLOC]]
+  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x16xf32>
 
   // CHECK: scf.for %[[I:.*]] =
   %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
@@ -173,14 +172,12 @@ func.func @matmul(
       %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
         tensor<256x192xf32> to tensor<256x16xf32>
 
-      // C was already replaced with a copy by preprocessing, so no copy is
-      // needed here.
-      // CHECK: %[[C_SLICE:.*]] = memref.subview %[[ALLOC]]
+      // Insert an artificial out-of-place buffer by extracting from %C instead
+      // of %arg6.
       %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
         tensor<128x192xf32> to tensor<8x16xf32>
 
-      // linalg.fill is inplace.
-      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[C_SLICE]]
+      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]]
       %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
 
       // CHECK: scf.for %[[K:.*]] =
@@ -191,7 +188,7 @@ func.func @matmul(
           tensor<256x16xf32> to tensor<32x16xf32>
 
         // linalg.matmul is inplace as well as the enclosing scf.for.
-        // CHECK: linalg.matmul ins({{.*}} outs(%[[C_SLICE]]
+        // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
         %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
                            outs(%arg8 : tensor<8x16xf32>)
           -> tensor<8x16xf32>
@@ -202,7 +199,7 @@ func.func @matmul(
       // that is not in place. So we must insert a copy of the small buffer into
       // the bigger buffer.
       // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
-      // CHECK: memref.copy %[[C_SLICE]], %[[T]]
+      // CHECK: memref.copy %[[ALLOC]], %[[T]]
       %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
         tensor<8x16xf32> into tensor<128x192xf32>
 
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index c0050d8c510d53..6ed97f05aa7cff 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -155,3 +155,22 @@ func.func @rank_zero_memref() -> i4 {
 //       CHECK32:   %[[LOAD:.+]] = memref.load %[[ALLOC]][] : memref<i32>
 //       CHECK32:   %[[TRUNC:.+]] = arith.trunci %[[LOAD]] : i32 to i4
 //       CHECK32:   return %[[TRUNC]]
+
+// -----
+
+func.func @memref_strided_i4(%idx : index) -> i4 {
+  %arr = memref.alloc() : memref<128xi4>
+  %subview = memref.subview %arr[32] [32] [1] : memref<128xi4> to memref<32xi4, strided<[1], offset:32>>
+  %1 = memref.load %subview[%idx] : memref<32xi4, strided<[1], offset:32>>
+  return %1 : i4
+}
+
+// CHECK-LABEL: func @memref_strided_i4
+//       CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<64xi8>
+//       CHECK:   %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][16] [16] [1] : memref<64xi8> to memref<16xi8, strided<[1], offset: 16>>
+//       CHECK:   %[[LOAD:.+]] = memref.load %[[SUBVIEW]]
+
+// CHECK32-LABEL: func @memref_strided_i4
+//       CHECK32:   %[[ALLOC:.+]] = memref.alloc() : memref<16xi32>
+//       CHECK32:   %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][4] [4] [1] : memref<16xi32> to memref<4xi32, strided<[1], offset: 4>>
+//       CHECK32:   %[[LOAD:.+]] = memref.load %[[SUBVIEW]]
diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
index a6303aa2d97110..ab0c78a8ba7669 100644
--- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
+++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
@@ -1369,3 +1369,128 @@ func.func @extract_strided_metadata_of_get_global_with_offset()
   return %base, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 :
       memref<i32>, index, index, index, index, index
 }
+
+// -----
+
+// Check that we simplify extract_strided_metadata of cast
+// when the source of the cast is compatible with what
+// `extract_strided_metadata`s accept.
+//
+// When we apply the transformation the resulting offset, sizes and strides
+// should come straight from the inputs of the cast.
+// Additionally the folder on extract_strided_metadata should propagate the
+// static information.
+//
+// CHECK-LABEL: func @extract_strided_metadata_of_cast
+//  CHECK-SAME: %[[ARG:.*]]: memref<3x?xi32, strided<[4, ?], offset: ?>>)
+//
+//   CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+//       CHECK: %[[BASE:.*]], %[[DYN_OFFSET:.*]], %[[DYN_SIZES:.*]]:2, %[[DYN_STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG]]
+//
+//       CHECK: return %[[BASE]], %[[DYN_OFFSET]], %[[C3]], %[[DYN_SIZES]]#1, %[[C4]], %[[DYN_STRIDES]]#1
+func.func @extract_strided_metadata_of_cast(
+  %arg : memref<3x?xi32, strided<[4, ?], offset:?>>)
+  -> (memref<i32>, index,
+      index, index,
+      index, index) {
+
+  %cast =
+    memref.cast %arg :
+      memref<3x?xi32, strided<[4, ?], offset: ?>> to
+      memref<?x?xi32, strided<[?, ?], offset: ?>>
+
+  %base, %base_offset, %sizes:2, %strides:2 =
+    memref.extract_strided_metadata %cast:memref<?x?xi32, strided<[?, ?], offset: ?>>
+    -> memref<i32>, index,
+       index, index,
+       index, index
+
+  return %base, %base_offset,
+    %sizes#0, %sizes#1,
+    %strides#0, %strides#1 :
+      memref<i32>, index,
+      index, index,
+      index, index
+}
+
+// -----
+
+// Check that we simplify extract_strided_metadata of cast
+// when the source of the cast is compatible with what
+// `extract_strided_metadata`s accept.
+//
+// Same as extract_strided_metadata_of_cast but with constant sizes and strides
+// in the destination type.
+//
+// CHECK-LABEL: func @extract_strided_metadata_of_cast_w_csts
+//  CHECK-SAME: %[[ARG:.*]]: memref<?x?xi32, strided<[?, ?], offset: ?>>)
+//
+//   CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+//   CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index
+//   CHECK-DAG: %[[C25:.*]] = arith.constant 25 : index
+//       CHECK: %[[BASE:.*]], %[[DYN_OFFSET:.*]], %[[DYN_SIZES:.*]]:2, %[[DYN_STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG]]
+//
+//       CHECK: return %[[BASE]], %[[C25]], %[[C4]], %[[DYN_SIZES]]#1, %[[DYN_STRIDES]]#0, %[[C18]]
+func.func @extract_strided_metadata_of_cast_w_csts(
+  %arg : memref<?x?xi32, strided<[?, ?], offset:?>>)
+  -> (memref<i32>, index,
+      index, index,
+      index, index) {
+
+  %cast =
+    memref.cast %arg :
+      memref<?x?xi32, strided<[?, ?], offset: ?>> to
+      memref<4x?xi32, strided<[?, 18], offset: 25>>
+
+  %base, %base_offset, %sizes:2, %strides:2 =
+    memref.extract_strided_metadata %cast:memref<4x?xi32, strided<[?, 18], offset: 25>>
+    -> memref<i32>, index,
+       index, index,
+       index, index
+
+  return %base, %base_offset,
+    %sizes#0, %sizes#1,
+    %strides#0, %strides#1 :
+      memref<i32>, index,
+      index, index,
+      index, index
+}
+// -----
+
+// Check that we don't simplify extract_strided_metadata of
+// cast when the source of the cast is unranked.
+// Unranked memrefs cannot feed into extract_strided_metadata operations.
+// Note: Technically we could still fold the sizes and strides.
+//
+// CHECK-LABEL: func @extract_strided_metadata_of_cast_unranked
+//  CHECK-SAME: %[[ARG:.*]]: memref<*xi32>)
+//
+//       CHECK: %[[CAST:.*]] = memref.cast %[[ARG]] :
+//       CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[CAST]]
+//
+//       CHECK: return %[[BASE]], %[[OFFSET]], %[[SIZES]]#0, %[[SIZES]]#1, %[[STRIDES]]#0, %[[STRIDES]]#1
+func.func @extract_strided_metadata_of_cast_unranked(
+  %arg : memref<*xi32>)
+  -> (memref<i32>, index,
+      index, index,
+      index, index) {
+
+  %cast =
+    memref.cast %arg :
+      memref<*xi32> to
+      memref<?x?xi32, strided<[?, ?], offset: ?>>
+
+  %base, %base_offset, %sizes:2, %strides:2 =
+    memref.extract_strided_metadata %cast:memref<?x?xi32, strided<[?, ?], offset: ?>>
+    -> memref<i32>, index,
+       index, index,
+       index, index
+
+  return %base, %base_offset,
+    %sizes#0, %sizes#1,
+    %strides#0, %strides#1 :
+      memref<i32>, index,
+      index, index,
+      index, index
+}
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index ff391e469815d7..66652070ec15f3 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -225,8 +225,8 @@ func.func @async_cp_size_invalid_f64(
 // -----
 
 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
-!tDescA  = !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>
-!tDescB  = !nvgpu.wgmma.descriptor<tensor = memref<64x121xf16, 3>>
+!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x121xf16, 3>>
 
 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !tResult, %acc2: !tResult) {
   // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 121 ) != 2nd dim matrix-C ( 128 )}}  
@@ -237,8 +237,8 @@ func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !t
 // -----
 
 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128xf32>>
-!tDescA  = !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>
-!tDescB  = !nvgpu.wgmma.descriptor<tensor = memref<64x128xf16, 3>>
+!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !tResult, %acc2: !tResult) {
   // expected-error @+1 {{'nvgpu.warpgroup.mma' op has matrices A, B, C and D, they must be 2 dimensional}}  
   %0:2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc1: !tDescA, !tDescB, !tResult, !tResult -> !tResult, !tResult
@@ -247,8 +247,8 @@ func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !t
 
 // -----
 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
-!tDescA  = !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>
-!tDescB  = !nvgpu.wgmma.descriptor<tensor = memref<64x128xf32, 3>>
+!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf32, 3>>
 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !tResult, %acc2: !tResult) {
   // expected-error @+1 {{'nvgpu.warpgroup.mma' op 'f32' += 'f16' * 'f32', it is not supported.}}  
   %0:2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc1: !tDescA, !tDescB, !tResult, !tResult -> !tResult, !tResult
@@ -258,8 +258,8 @@ func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !t
 // -----
 
 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
-!tDescA  = !nvgpu.wgmma.descriptor<tensor = memref<128x64xf16, 3>>
-!tDescB  = !nvgpu.wgmma.descriptor<tensor = memref<64x512xf16, 3>>
+!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x512xf16, 3>>
 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc1: !tResult, %acc2: !tResult) {
   // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 512 ) != 2nd dim matrix-C ( 128 )}}
   %0:2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc1: !tDescA, !tDescB, !tResult, !tResult -> !tResult, !tResult
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index 1db155b2db3824..24da8d84b18e26 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -253,11 +253,10 @@ func.func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32
 //  CHECK-SAME:     %[[t:.*]]: memref<?xf32
 //       CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}})
 //       CHECK:   memref.copy %[[t]], %[[alloc]]
-//       CHECK:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[t]])
+//       CHECK:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[alloc]])
 //   CHECK-DAG:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
-//       CHECK:     memref.copy %[[alloc]], %[[alloc2]]
-//       CHECK:     %[[alloc2_casted:.*]] = memref.cast %[[alloc2]]
-//       CHECK:     scf.yield %[[alloc2_casted]]
+//       CHECK:     memref.copy %[[t]], %[[alloc2]]
+//       CHECK:     scf.yield %[[alloc2]]
 //       CHECK:   return %[[for]]
 func.func @scf_for_yield_non_equivalent(
     %t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
@@ -270,20 +269,12 @@ func.func @scf_for_yield_non_equivalent(
 
 // -----
 
-// Note: This bufferizes to inefficient code, but bufferization should not see
-// such IR in the first place. The iter_arg would canonicalize away. This test
-// case is just to ensure that the bufferization generates correct code.
-
 // CHECK-LABEL: func @scf_for_yield_allocation(
 //  CHECK-SAME:     %[[t:.*]]: memref<?xf32
 //       CHECK:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[t]])
-// This alloc is for the bufferization.alloc_tensor.
-//   CHECK-DAG:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
-// This alloc is for the scf.yield.
-//       CHECK:     %[[alloc3:.*]] = memref.alloc(%{{.*}})
-//       CHECK:     memref.copy %[[alloc2]], %[[alloc3]]
-//       CHECK:     %[[casted3:.*]] = memref.cast %[[alloc3]]
-//       CHECK:     scf.yield %[[casted3]]
+//   CHECK-DAG:     %[[alloc:.*]] = memref.alloc(%{{.*}})
+//       CHECK:     %[[casted:.*]] = memref.cast %[[alloc]]
+//       CHECK:     scf.yield %[[casted]]
 //       CHECK:   return %[[for]]
 func.func @scf_for_yield_allocation(%t: tensor<?xf32>, %lb : index, %ub : index,
                                %step : index) -> tensor<?xf32> {
@@ -606,9 +597,9 @@ func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
 
   // CHECK: scf.forall (%{{.*}}) in (2) {
 
-  // Load from the copy and store into the shared output.
-  // CHECK:   %[[subview:.*]] = memref.subview %[[t]]
-  // CHECK:   memref.load %[[t_copy]]
+  // Load from the original and store into the copy.
+  // CHECK:   %[[subview:.*]] = memref.subview %[[t_copy]]
+  // CHECK:   memref.load %[[t]]
   // CHECK:   memref.store %{{.*}}, %[[subview]]
   %0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
     %offset = arith.muli %c5, %tid : index
@@ -752,14 +743,16 @@ func.func @scf_for_yield_alias_of_non_equivalent(%sz: index) -> tensor<?xf32> {
   // CHECK: scf.for
   %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor<?xf32> {
     %iv_sub = arith.subi %iv, %c1 : index
-    // CHECK: memref.subview %[[generate_copy]]
+    // CHECK: memref.subview %[[generate]]
     %ll = tensor.extract_slice %0[%iv_sub][%sz][1] : tensor<?xf32> to tensor<?xf32>
     %l = tensor.extract %ll[%c0] : tensor<?xf32>
     %double = arith.mulf %cst, %l : f32
-    // CHECK: memref.store %{{.*}}, %[[generate]]
+    // CHECK: memref.store %{{.*}}, %[[generate_copy]]
     %s = tensor.insert %double into %t[%iv] : tensor<?xf32>
     scf.yield %s : tensor<?xf32>
   }
+
+  // CHECK: return %[[generate_copy]]
   return %r : tensor<?xf32>
 }
 
diff --git a/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir b/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir
index c96a55aa1e8b2f..cafb431b753063 100644
--- a/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir
+++ b/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir
@@ -84,7 +84,7 @@ func.func @sparse_push_back_inbound(%arg0: index, %arg1: memref<?xf64>, %arg2: f
 // CHECK-DAG:     func.func private @_sparse_qsort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref<?xindex>, %arg3: memref<?xf32>, %arg4: memref<?xi32>) {
 // CHECK-LABEL:   func.func @sparse_sort_coo_quick
 func.func @sparse_sort_coo_quick(%arg0: index, %arg1: memref<100xindex>, %arg2: memref<?xf32>, %arg3: memref<10xi32>) -> (memref<100xindex>, memref<?xf32>, memref<10xi32>) {
-  sparse_tensor.sort_coo quick_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
+  sparse_tensor.sort quick_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
   return %arg1, %arg2, %arg3 : memref<100xindex>, memref<?xf32>, memref<10xi32>
 }
 
@@ -103,7 +103,7 @@ func.func @sparse_sort_coo_quick(%arg0: index, %arg1: memref<100xindex>, %arg2:
 // CHECK-DAG:     func.func private @_sparse_hybrid_qsort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref<?xindex>, %arg3: memref<?xf32>, %arg4: memref<?xi32>, %arg5: i64) {
 // CHECK-LABEL:   func.func @sparse_sort_coo_hybrid
 func.func @sparse_sort_coo_hybrid(%arg0: index, %arg1: memref<100xindex>, %arg2: memref<?xf32>, %arg3: memref<10xi32>) -> (memref<100xindex>, memref<?xf32>, memref<10xi32>) {
-  sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
+  sparse_tensor.sort hybrid_quick_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
   return %arg1, %arg2, %arg3 : memref<100xindex>, memref<?xf32>, memref<10xi32>
 }
 
@@ -118,7 +118,7 @@ func.func @sparse_sort_coo_hybrid(%arg0: index, %arg1: memref<100xindex>, %arg2:
 // CHECK-DAG:     func.func private @_sparse_sort_stable_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref<?xindex>, %arg3: memref<?xf32>, %arg4: memref<?xi32>) {
 // CHECK-LABEL:   func.func @sparse_sort_coo_stable
 func.func @sparse_sort_coo_stable(%arg0: index, %arg1: memref<100xindex>, %arg2: memref<?xf32>, %arg3: memref<10xi32>) -> (memref<100xindex>, memref<?xf32>, memref<10xi32>) {
-  sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
+  sparse_tensor.sort insertion_sort_stable %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
   return %arg1, %arg2, %arg3 : memref<100xindex>, memref<?xf32>, memref<10xi32>
 }
 
@@ -133,6 +133,6 @@ func.func @sparse_sort_coo_stable(%arg0: index, %arg1: memref<100xindex>, %arg2:
 // CHECK-DAG:     func.func private @_sparse_heap_sort_0_1_index_coo_1_f32_i32(%arg0: index, %arg1: index, %arg2: memref<?xindex>, %arg3: memref<?xf32>, %arg4: memref<?xi32>) {
 // CHECK-LABEL:   func.func @sparse_sort_coo_heap
 func.func @sparse_sort_coo_heap(%arg0: index, %arg1: memref<100xindex>, %arg2: memref<?xf32>, %arg3: memref<10xi32>) -> (memref<100xindex>, memref<?xf32>, memref<10xi32>) {
-  sparse_tensor.sort_coo heap_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
+  sparse_tensor.sort heap_sort %arg0, %arg1 jointly %arg2, %arg3 {perm_map = #ID_MAP, ny = 1: index} : memref<100xindex> jointly memref<?xf32>, memref<10xi32>
   return %arg1, %arg2, %arg3 : memref<100xindex>, memref<?xf32>, memref<10xi32>
 }
diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir
index 69a9c274a861ce..4ac768c21aff8f 100644
--- a/mlir/test/Dialect/SparseTensor/codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/codegen.mlir
@@ -423,7 +423,7 @@ func.func @sparse_expansion3(%arg0: index, %arg1: index) -> memref<?xindex> {
 //   CHECK-DAG: %[[A9:.*]] = arith.constant 0.000000e+00 : f64
 //   CHECK-DAG: %[[A10:.*]] = arith.constant 1 : index
 //   CHECK-DAG: %[[A11:.*]] = arith.constant 0 : index
-//       CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A7]], %[[A6]]
+//       CHECK: sparse_tensor.sort hybrid_quick_sort %[[A7]], %[[A6]]
 //       CHECK: %[[A12:.*]]:4 = scf.for %[[A13:.*]] = %[[A11]] to %[[A7]] step %[[A10]] iter_args(%[[A14:.*]] = %[[A0]], %[[A15:.*]] = %[[A1]], %[[A16:.*]] = %[[A2]], %[[A17:.*]] = %[[A3]])
 //       CHECK:   %[[A18:.*]] = memref.load %[[A6]]{{\[}}%[[A13]]] : memref<?xindex>
 //       CHECK:   %[[A19:.*]] = memref.load %[[A4]]{{\[}}%[[A18]]] : memref<?xf64>
@@ -471,7 +471,7 @@ func.func @sparse_compression_1d(%tensor: tensor<100xf64, #SV>,
 //       CHECK:     %[[A11:.*]] = arith.constant 0.000000e+00 : f64
 //       CHECK:     %[[A12:.*]] = arith.constant 1 : index
 //       CHECK:     %[[A13:.*]] = arith.constant 0 : index
-//       CHECK:     sparse_tensor.sort_coo hybrid_quick_sort %[[A7]], %[[A6]]
+//       CHECK:     sparse_tensor.sort hybrid_quick_sort %[[A7]], %[[A6]]
 //       CHECK:     %[[A14:.*]]:4 = scf.for %[[A15:.*]] = %[[A13]] to %[[A7]] step %[[A12]] iter_args(%[[A16:.*]] = %[[A0]], %[[A17:.*]] = %[[A1]], %[[A18:.*]] = %[[A2]], %[[A19:.*]] = %[[A3]]) -> (memref<?xi32>, memref<?xi64>, memref<?xf64>, !sparse_tensor.storage_specifier
 //       CHECK:       %[[A20:.*]] = memref.load %[[A6]]{{\[}}%[[A15]]] : memref<?xindex>
 //       CHECK:       %[[A21:.*]] = memref.load %[[A4]]{{\[}}%[[A20]]] : memref<?xf64>
@@ -507,7 +507,7 @@ func.func @sparse_compression(%tensor: tensor<8x8xf64, #CSR>,
   return %1 : tensor<8x8xf64, #CSR>
 }
 
-// CHECK-LABEL: func.func private @_insert_dense_compressed_no_8_8_f64_0_0(
+// CHECK-LABEL: func.func private @_insert_dense_compressed_nonordered_8_8_f64_0_0(
 //  CHECK-SAME: %[[A1:.*0]]: memref<?xindex>,
 //  CHECK-SAME: %[[A2:.*1]]: memref<?xindex>,
 //  CHECK-SAME: %[[A3:.*2]]: memref<?xf64>,
@@ -533,7 +533,7 @@ func.func @sparse_compression(%tensor: tensor<8x8xf64, #CSR>,
 //       CHECK:     %[[A13:.*]]:4 = scf.for %[[A14:.*]] = %[[A11]] to %[[A7]] step %[[A12]] iter_args(%[[A15:.*]] = %[[A0]], %[[A16:.*]] = %[[A1]], %[[A17:.*]] = %[[A2]], %[[A18:.*]] = %[[A3]]) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
 //       CHECK:       %[[A19:.*]] = memref.load %[[A6]]{{\[}}%[[A14]]] : memref<?xindex>
 //       CHECK:       %[[A20:.*]] = memref.load %[[A4]]{{\[}}%[[A19]]] : memref<?xf64>
-//       CHECK:       %[[A21:.*]]:4 = func.call @_insert_dense_compressed_no_8_8_f64_0_0(%[[A15]], %[[A16]], %[[A17]], %[[A18]], %[[A8]], %[[A19]], %[[A20]]) : (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+//       CHECK:       %[[A21:.*]]:4 = func.call @_insert_dense_compressed_nonordered_8_8_f64_0_0(%[[A15]], %[[A16]], %[[A17]], %[[A18]], %[[A8]], %[[A19]], %[[A20]]) : (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
 //       CHECK:       memref.store %[[A10]], %[[A4]]{{\[}}%[[A19]]] : memref<?xf64>
 //       CHECK:       memref.store %[[A9]], %[[A5]]{{\[}}%[[A19]]] : memref<?xi1>
 //       CHECK:       scf.yield %[[A21]]#0, %[[A21]]#1, %[[A21]]#2, %[[A21]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
@@ -611,7 +611,7 @@ func.func @sparse_insert_typed(%arg0: tensor<128xf64, #SparseVector>, %arg1: ind
   return %1 : tensor<128xf64, #SparseVector>
 }
 
-// CHECK-LABEL: func.func private @_insert_compressed_nu_singleton_5_6_f64_0_0(
+// CHECK-LABEL: func.func private @_insert_compressed_nonunique_singleton_5_6_f64_0_0(
 //  CHECK-SAME: %[[A1:.*0]]: memref<?xindex>,
 //  CHECK-SAME: %[[A2:.*1]]: memref<?xindex>,
 //  CHECK-SAME: %[[A3:.*2]]: memref<?xf64>,
@@ -627,7 +627,7 @@ func.func @sparse_insert_typed(%arg0: tensor<128xf64, #SparseVector>, %arg1: ind
 //  CHECK-SAME: %[[A3:.*3]]: !sparse_tensor.storage_specifier
 //  CHECK-SAME: %[[A4:.*4]]: index,
 //  CHECK-SAME: %[[A5:.*5]]: f64)
-//       CHECK: %[[R:.*]]:4 = call @_insert_compressed_nu_singleton_5_6_f64_0_0(%[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A4]], %[[A5]])
+//       CHECK: %[[R:.*]]:4 = call @_insert_compressed_nonunique_singleton_5_6_f64_0_0(%[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A4]], %[[A5]])
 //       CHECK: return %[[R]]#0, %[[R]]#1, %[[R]]#2, %[[R]]#3
 func.func @sparse_insert_coo(%arg0: tensor<5x6xf64, #Coo>, %arg1: index, %arg2: f64) -> tensor<5x6xf64, #Coo> {
   %0 = sparse_tensor.insert %arg2 into %arg0[%arg1, %arg1] : tensor<5x6xf64, #Coo>
@@ -699,7 +699,7 @@ func.func @sparse_convert_element_type(%arg0: tensor<32xf32, #SparseVector>) ->
 //       CHECK: %[[A33:.*]] = call @getSparseTensorReaderReadToBuffers0F32(%[[A5]], %[[A32]], %[[A14]], %[[A15]])
 //       CHECK: %[[A34:.*]] = arith.cmpi eq, %[[A33]], %[[A1]] : i1
 //       CHECK: scf.if %[[A34]] {
-//       CHECK:   sparse_tensor.sort_coo  hybrid_quick_sort %[[A10]], %[[A14]] jointly %[[A15]] {ny = 0 : index, perm_map = #{{.*}}} : memref<?xindex> jointly memref<?xf32>
+//       CHECK:   sparse_tensor.sort  hybrid_quick_sort %[[A10]], %[[A14]] jointly %[[A15]] {ny = 0 : index, perm_map = #{{.*}}} : memref<?xindex> jointly memref<?xf32>
 //       CHECK: }
 //       CHECK: memref.store %[[A10]], %[[A27]]{{\[}}%[[A2]]] : memref<?xindex>
 //       CHECK: %[[A36:.*]] = sparse_tensor.storage_specifier.set %[[A30]]  crd_mem_sz at 0 with %[[A11]]
diff --git a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
index 2a2619daf49365..c373fd23bbef49 100644
--- a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
+++ b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
@@ -177,7 +177,7 @@ func.func @sparse_convert_singleton(%arg0: tensor<?xf32, #SparseSingleton64>) ->
 //       CHECK-RWT: %[[VAL_16:.*]] = sparse_tensor.load %[[VAL_17:.*]] hasInserts : tensor<?x?x?xf32, #{{.*}}>>
 //       CHECK-RWT: %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_16]] : tensor<?x?x?xf32, #{{.*}}>> to memref<?xf32>
 //       CHECK-RWT: %[[VAL_19:.*]] = sparse_tensor.coordinates_buffer %[[VAL_16]] : tensor<?x?x?xf32, #{{.*}}>> to memref<?xindex>
-//       CHECK-RWT: sparse_tensor.sort_coo  hybrid_quick_sort %[[VAL_7]], %[[VAL_19]] jointly %[[VAL_18]] {ny = 0 : index, perm_map = #map}
+//       CHECK-RWT: sparse_tensor.sort  hybrid_quick_sort %[[VAL_7]], %[[VAL_19]] jointly %[[VAL_18]] {ny = 0 : index, perm_map = #map}
 //       CHECK-RWT: %[[VAL_20:.*]] = bufferization.alloc_tensor(%[[VAL_4]], %[[VAL_5]], %[[VAL_6]]) size_hint=%[[VAL_7]]
 //       CHECK-RWT: %[[VAL_21:.*]] = sparse_tensor.foreach in %[[VAL_16]] init(%[[VAL_20]])
 //       CHECK-RWT: ^bb0(%[[VAL_22:.*]]: index, %[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index, %[[VAL_25:.*]]: f32, %[[VAL_26:.*]]: tensor<?x?x?xf32, #{{.*}}>>):
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
index 3c33ba42f4d388..2df4237efa0bbe 100644
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -794,7 +794,7 @@ func.func @sparse_tensor_foreach(%arg0: tensor<2x4xf64, #DCSR>, %arg1: f32) -> (
 
 func.func @sparse_sort_coo_x_type( %arg0: index, %arg1: memref<?xf32>) {
   // expected-error@+1 {{operand #1 must be 1D memref of integer or index values}}
-  sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 {perm_map = #MAP} : memref<?xf32>
+  sparse_tensor.sort insertion_sort_stable %arg0, %arg1 {perm_map = #MAP} : memref<?xf32>
   return
 }
 
@@ -805,7 +805,7 @@ func.func @sparse_sort_coo_x_type( %arg0: index, %arg1: memref<?xf32>) {
 func.func @sparse_sort_coo_x_too_small(%arg0: memref<50xindex>) {
   %i20 = arith.constant 20 : index
   // expected-error@+1 {{Expected dimension(xy) >= n * (rank(perm_map) + ny) got 50 < 60}}
-  sparse_tensor.sort_coo hybrid_quick_sort %i20, %arg0 {perm_map = #MAP, ny = 1 : index} : memref<50xindex>
+  sparse_tensor.sort hybrid_quick_sort %i20, %arg0 {perm_map = #MAP, ny = 1 : index} : memref<50xindex>
   return
 }
 
@@ -816,7 +816,7 @@ func.func @sparse_sort_coo_x_too_small(%arg0: memref<50xindex>) {
 func.func @sparse_sort_coo_y_too_small(%arg0: memref<60xindex>, %arg1: memref<10xf32>) {
   %i20 = arith.constant 20 : index
   // expected-error@+1 {{Expected dimension(y) >= n got 10 < 20}}
-  sparse_tensor.sort_coo insertion_sort_stable %i20, %arg0 jointly %arg1 {perm_map = #MAP, ny = 1 : index} : memref<60xindex> jointly memref<10xf32>
+  sparse_tensor.sort insertion_sort_stable %i20, %arg0 jointly %arg1 {perm_map = #MAP, ny = 1 : index} : memref<60xindex> jointly memref<10xf32>
   return
 }
 
@@ -826,7 +826,7 @@ func.func @sparse_sort_coo_y_too_small(%arg0: memref<60xindex>, %arg1: memref<10
 
 func.func @sparse_sort_coo_no_perm(%arg0: index, %arg1: memref<?xindex>) -> (memref<?xindex>) {
   // expected-error@+1 {{Expected a permutation map, got (d0, d1) -> (d0, d0)}}
-  sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 {perm_map = #NON_PERM_MAP, ny = 1 : index}: memref<?xindex>
+  sparse_tensor.sort hybrid_quick_sort %arg0, %arg1 {perm_map = #NON_PERM_MAP, ny = 1 : index}: memref<?xindex>
   return %arg1 : memref<?xindex>
 }
 
diff --git a/mlir/test/Dialect/SparseTensor/post_rewriting.mlir b/mlir/test/Dialect/SparseTensor/post_rewriting.mlir
deleted file mode 100644
index 93fc610b64b335..00000000000000
--- a/mlir/test/Dialect/SparseTensor/post_rewriting.mlir
+++ /dev/null
@@ -1,99 +0,0 @@
-// RUN: mlir-opt %s -post-sparsification-rewrite | FileCheck %s
-
-#SparseVector = #sparse_tensor.encoding<{
-  map = (d0) -> (d0 : compressed)
-}>
-
-#SparseMatrix = #sparse_tensor.encoding<{
-  map = (d0, d1) -> (d0 : compressed, d1 : compressed)
-}>
-
-// CHECK-LABEL: func.func @expand_dense(
-// CHECK-SAME:    %[[A:.*]]: tensor<12xf64>) -> tensor<3x4xf64> {
-// CHECK:         %[[E:.*]] = tensor.expand_shape %[[A]] {{.*}} : tensor<12xf64> into tensor<3x4xf64>
-// CHECK:         return %[[E]] : tensor<3x4xf64>
-// CHECK:       }
-func.func @expand_dense(%arg0: tensor<12xf64>) -> tensor<3x4xf64> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64> into tensor<3x4xf64>
-  return %0 : tensor<3x4xf64>
-}
-
-// CHECK-LABEL: func.func @expand_from_sparse(
-// CHECK-SAME:    %[[A:.*]]: tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<3x4xf64> {
-// CHECK:         %[[C:.*]] = sparse_tensor.convert %[[A]] : tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>> to tensor<12xf64>
-// CHECK:         %[[E:.*]] = tensor.expand_shape %[[C]] {{.*}} : tensor<12xf64> into tensor<3x4xf64>
-// CHECK:         return %[[E]] : tensor<3x4xf64>
-// CHECK:       }
-func.func @expand_from_sparse(%arg0: tensor<12xf64, #SparseVector>) -> tensor<3x4xf64> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64, #SparseVector> into tensor<3x4xf64>
-  return %0 : tensor<3x4xf64>
-}
-
-// CHECK-LABEL: func.func @expand_to_sparse(
-// CHECK-SAME:    %[[A:.*]]: tensor<12xf64>) -> tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[E:.*]] = tensor.expand_shape %[[A]] {{.*}} : tensor<12xf64> into tensor<3x4xf64>
-// CHECK:         %[[C:.*]] = sparse_tensor.convert %[[E]] : tensor<3x4xf64> to tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:         return %[[C]] : tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:       }
-func.func @expand_to_sparse(%arg0: tensor<12xf64>) -> tensor<3x4xf64, #SparseMatrix> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64> into tensor<3x4xf64, #SparseMatrix>
-  return %0 : tensor<3x4xf64, #SparseMatrix>
-}
-
-//
-// Not rewritten, needs conversion.
-//
-// CHECK-LABEL:   func.func @expand_sparse2sparse(
-// CHECK-SAME:    %[[A:.*]]: tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[E:.*]] = tensor.expand_shape %[[A]] {{.*}} : tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>> into tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:         return %[[E]] : tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:       }
-func.func @expand_sparse2sparse(%arg0: tensor<12xf64, #SparseVector>) -> tensor<3x4xf64, #SparseMatrix> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64, #SparseVector> into tensor<3x4xf64, #SparseMatrix>
-  return %0 : tensor<3x4xf64, #SparseMatrix>
-}
-
-// CHECK-LABEL: func.func @collapse_dense(
-// CHECK-SAME:    %[[A:.*]]: tensor<3x4xf64>) -> tensor<12xf64> {
-// CHECK:         %[[R:.*]] = tensor.collapse_shape %[[A]] {{.*}} : tensor<3x4xf64> into tensor<12xf64>
-// CHECK:         return %[[R]] : tensor<12xf64>
-// CHECK:       }
-func.func @collapse_dense(%arg0: tensor<3x4xf64>) -> tensor<12xf64> {
-  %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<3x4xf64> into tensor<12xf64>
-  return %0 : tensor<12xf64>
-}
-
-// CHECK-LABEL: func.func @collapse_from_sparse(
-// CHECK-SAME:    %[[A:.*]]: tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<12xf64> {
-// CHECK:         %[[C:.*]] = sparse_tensor.convert %[[A]] : tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>> to tensor<3x4xf64>
-// CHECK:         %[[R:.*]] = tensor.collapse_shape %[[C]] {{.*}} : tensor<3x4xf64> into tensor<12xf64>
-// CHECK:         return %[[R]] : tensor<12xf64>
-// CHECK:       }
-func.func @collapse_from_sparse(%arg0: tensor<3x4xf64, #SparseMatrix>) -> tensor<12xf64> {
-  %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<3x4xf64, #SparseMatrix> into tensor<12xf64>
-  return %0 : tensor<12xf64>
-}
-
-// CHECK-LABEL: func.func @collapse_to_sparse(
-// CHECK-SAME:    %[[A:.*]]: tensor<3x4xf64>) -> tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[R:.*]] = tensor.collapse_shape %[[A]] {{.*}} : tensor<3x4xf64> into tensor<12xf64>
-// CHECK:         %[[C:.*]] = sparse_tensor.convert %[[R]] : tensor<12xf64> to tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:         return %[[C]] : tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:       }
-func.func @collapse_to_sparse(%arg0: tensor<3x4xf64>) -> tensor<12xf64, #SparseVector> {
-  %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<3x4xf64> into tensor<12xf64, #SparseVector>
-  return %0 : tensor<12xf64, #SparseVector>
-}
-
-//
-// Not rewritten, needs conversion.
-//
-// CHECK-LABEL:   func.func @collapse_sparse2sparse(
-// CHECK-SAME:    %[[A:.*]]: tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>> {
-// CHECK:         %[[C:.*]] = tensor.collapse_shape %[[A]] {{.*}} : tensor<3x4xf64, #sparse_tensor.encoding<{{{.*}}}>> into tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:         return %[[C]] : tensor<12xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK:       }
-func.func @collapse_sparse2sparse(%arg0: tensor<3x4xf64, #SparseMatrix>) -> tensor<12xf64, #SparseVector> {
-  %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<3x4xf64, #SparseMatrix> into tensor<12xf64, #SparseVector>
-  return %0 : tensor<12xf64, #SparseVector>
-}
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
index 317a0735fb8bf8..82267be34b9384 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -612,10 +612,10 @@ func.func @sparse_tensor_foreach(%arg0: tensor<2x4xf64, #DCSR>, %arg1: f32) -> (
 // CHECK-LABEL: func @sparse_sort_coo(
 //  CHECK-SAME: %[[A:.*]]: index,
 //  CHECK-SAME: %[[B:.*]]: memref<?xindex>)
-//       CHECK: sparse_tensor.sort_coo hybrid_quick_sort %[[A]], %[[B]] {ny = 1 : index, perm_map = #{{.*}}} : memref<?xindex>
+//       CHECK: sparse_tensor.sort hybrid_quick_sort %[[A]], %[[B]] {ny = 1 : index, perm_map = #{{.*}}} : memref<?xindex>
 //       CHECK: return %[[B]]
 func.func @sparse_sort_coo(%arg0: index, %arg1: memref<?xindex>) -> (memref<?xindex>) {
-  sparse_tensor.sort_coo hybrid_quick_sort %arg0, %arg1 {perm_map = #ID_MAP, ny = 1 : index}: memref<?xindex>
+  sparse_tensor.sort hybrid_quick_sort %arg0, %arg1 {perm_map = #ID_MAP, ny = 1 : index}: memref<?xindex>
   return %arg1 : memref<?xindex>
 }
 
@@ -627,9 +627,9 @@ func.func @sparse_sort_coo(%arg0: index, %arg1: memref<?xindex>) -> (memref<?xin
 //  CHECK-SAME: %[[A:.*]]: index,
 //  CHECK-SAME: %[[B:.*]]: memref<?xi64>,
 //  CHECK-SAME: %[[C:.*]]: memref<?xf32>)
-//       CHECK: sparse_tensor.sort_coo insertion_sort_stable %[[A]], %[[B]] jointly %[[C]] {ny = 1 : index, perm_map = #{{.*}}}
+//       CHECK: sparse_tensor.sort insertion_sort_stable %[[A]], %[[B]] jointly %[[C]] {ny = 1 : index, perm_map = #{{.*}}}
 //       CHECK: return %[[B]], %[[C]]
 func.func @sparse_sort_coo_stable(%arg0: index, %arg1: memref<?xi64>, %arg2: memref<?xf32>) -> (memref<?xi64>, memref<?xf32>) {
-  sparse_tensor.sort_coo insertion_sort_stable %arg0, %arg1 jointly %arg2 {perm_map = #ID_MAP, ny = 1 : index}: memref<?xi64> jointly memref<?xf32>
+  sparse_tensor.sort insertion_sort_stable %arg0, %arg1 jointly %arg2 {perm_map = #ID_MAP, ny = 1 : index}: memref<?xi64> jointly memref<?xf32>
   return %arg1, %arg2 : memref<?xi64>, memref<?xf32>
 }
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
index d82c59a714d14a..c4ef50bee01ea2 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s -split-input-file | mlir-opt | FileCheck %s
 
 // CHECK-LABEL: func private @sparse_1d_tensor(
-// CHECK-SAME: tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>)
+// CHECK-SAME: tensor<32xf64, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>)
 func.func private @sparse_1d_tensor(tensor<32xf64, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>)
 
 // -----
@@ -13,7 +13,7 @@ func.func private @sparse_1d_tensor(tensor<32xf64, #sparse_tensor.encoding<{ map
 }>
 
 // CHECK-LABEL: func private @sparse_csr(
-// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 64, crdWidth = 64 }>>)
+// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed), posWidth = 64, crdWidth = 64 }>>)
 func.func private @sparse_csr(tensor<?x?xf32, #CSR>)
 
 // -----
@@ -23,7 +23,7 @@ func.func private @sparse_csr(tensor<?x?xf32, #CSR>)
 }>
 
 // CHECK-LABEL: func private @CSR_explicit(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>>
 func.func private @CSR_explicit(%arg0: tensor<?x?xf64, #CSR_explicit>) {
   return
 }
@@ -37,7 +37,7 @@ func.func private @CSR_explicit(%arg0: tensor<?x?xf64, #CSR_explicit>) {
 }>
 
 // CHECK-LABEL: func private @sparse_csc(
-// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>>)
+// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ map = (d0, d1) -> (d1 : dense, d0 : compressed) }>>)
 func.func private @sparse_csc(tensor<?x?xf32, #CSC>)
 
 // -----
@@ -49,7 +49,7 @@ func.func private @sparse_csc(tensor<?x?xf32, #CSC>)
 }>
 
 // CHECK-LABEL: func private @sparse_dcsc(
-// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)>, crdWidth = 64 }>>)
+// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ map = (d0, d1) -> (d1 : compressed, d0 : compressed), crdWidth = 64 }>>)
 func.func private @sparse_dcsc(tensor<?x?xf32, #DCSC>)
 
 // -----
@@ -59,7 +59,7 @@ func.func private @sparse_dcsc(tensor<?x?xf32, #DCSC>)
 }>
 
 // CHECK-LABEL: func private @sparse_coo(
-// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed_nu_no", "singleton_no" ] }>>)
+// CHECK-SAME: tensor<?x?xf32, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed(nonunique, nonordered), d1 : singleton(nonordered)) }>>)
 func.func private @sparse_coo(tensor<?x?xf32, #COO>)
 
 // -----
@@ -69,7 +69,7 @@ func.func private @sparse_coo(tensor<?x?xf32, #COO>)
 }>
 
 // CHECK-LABEL: func private @sparse_bcoo(
-// CHECK-SAME: tensor<?x?x?xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed_hi_nu", "singleton" ] }>>)
+// CHECK-SAME: tensor<?x?x?xf32, #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : dense, d1 : loose_compressed(nonunique), d2 : singleton) }>>)
 func.func private @sparse_bcoo(tensor<?x?x?xf32, #BCOO>)
 
 // -----
@@ -79,7 +79,7 @@ func.func private @sparse_bcoo(tensor<?x?x?xf32, #BCOO>)
 }>
 
 // CHECK-LABEL: func private @sparse_sorted_coo(
-// CHECK-SAME: tensor<10x10xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed_nu", "singleton" ] }>>)
+// CHECK-SAME: tensor<10x10xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton) }>>)
 func.func private @sparse_sorted_coo(tensor<10x10xf64, #SortedCOO>)
 
 // -----
@@ -94,7 +94,7 @@ func.func private @sparse_sorted_coo(tensor<10x10xf64, #SortedCOO>)
 }>
 
 // CHECK-LABEL: func private @sparse_bcsr(
-// CHECK-SAME: tensor<10x60xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], dimToLvl = affine_map<(d0, d1) -> (d0 floordiv 2, d1 floordiv 3, d0 mod 2, d1 mod 3)> }>>
+// CHECK-SAME: tensor<10x60xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 floordiv 2 : compressed, d1 floordiv 3 : compressed, d0 mod 2 : dense, d1 mod 3 : dense) }>>
 func.func private @sparse_bcsr(tensor<10x60xf64, #BCSR>)
 
 
@@ -105,7 +105,7 @@ func.func private @sparse_bcsr(tensor<10x60xf64, #BCSR>)
 }>
 
 // CHECK-LABEL: func private @sparse_ell(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "compressed" ], dimToLvl = affine_map<(d0, d1)[s0] -> (d0 * (s0 * 4), d0, d1)> }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = [s0](d0, d1) -> (d0 * (s0 * 4) : dense, d0 : dense, d1 : compressed) }>>
 func.func private @sparse_ell(tensor<?x?xf64, #ELL>)
 
 // -----
@@ -115,7 +115,7 @@ func.func private @sparse_ell(tensor<?x?xf64, #ELL>)
 }>
 
 // CHECK-LABEL: func private @sparse_slice(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimSlices = [ (1, 4, 1), (1, 4, 2) ] }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0 : #sparse_tensor<slice(1, 4, 1)>, d1 : #sparse_tensor<slice(1, 4, 2)>) -> (d0 : dense, d1 : compressed) }>>
 func.func private @sparse_slice(tensor<?x?xf64, #CSR_SLICE>)
 
 // -----
@@ -125,7 +125,7 @@ func.func private @sparse_slice(tensor<?x?xf64, #CSR_SLICE>)
 }>
 
 // CHECK-LABEL: func private @sparse_slice(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimSlices = [ (1, ?, 1), (?, 4, 2) ] }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0 : #sparse_tensor<slice(1, ?, 1)>, d1 : #sparse_tensor<slice(?, 4, 2)>) -> (d0 : dense, d1 : compressed) }>>
 func.func private @sparse_slice(tensor<?x?xf64, #CSR_SLICE>)
 
 // -----
@@ -138,7 +138,7 @@ func.func private @sparse_slice(tensor<?x?xf64, #CSR_SLICE>)
 }>
 
 // CHECK-LABEL: func private @sparse_2_out_of_4(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed24" ] }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : block2_4) }>>
 func.func private @sparse_2_out_of_4(tensor<?x?xf64, #NV_24>)
 
 // -----
@@ -153,7 +153,7 @@ func.func private @sparse_2_out_of_4(tensor<?x?xf64, #NV_24>)
 }>
 
 // CHECK-LABEL: func private @BCSR(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], dimToLvl = affine_map<(d0, d1) -> (d0 floordiv 2, d1 floordiv 3, d0 mod 2, d1 mod 3)> }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 floordiv 2 : compressed, d1 floordiv 3 : compressed, d0 mod 2 : dense, d1 mod 3 : dense) }>>
 func.func private @BCSR(%arg0: tensor<?x?xf64, #BCSR>) {
   return
 }
@@ -174,7 +174,7 @@ func.func private @BCSR(%arg0: tensor<?x?xf64, #BCSR>) {
 }>
 
 // CHECK-LABEL: func private @BCSR_explicit(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], dimToLvl = affine_map<(d0, d1) -> (d0 floordiv 2, d1 floordiv 3, d0 mod 2, d1 mod 3)> }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 floordiv 2 : compressed, d1 floordiv 3 : compressed, d0 mod 2 : dense, d1 mod 3 : dense) }>>
 func.func private @BCSR_explicit(%arg0: tensor<?x?xf64, #BCSR_explicit>) {
   return
 }
@@ -190,7 +190,7 @@ func.func private @BCSR_explicit(%arg0: tensor<?x?xf64, #BCSR_explicit>) {
 }>
 
 // CHECK-LABEL: func private @NV_24(
-// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "compressed24" ], dimToLvl = affine_map<(d0, d1) -> (d0, d1 floordiv 4, d1 mod 4)> }>>
+// CHECK-SAME: tensor<?x?xf64, #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : block2_4) }>>
 func.func private @NV_24(%arg0: tensor<?x?xf64, #NV_24>) {
   return
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
index a2431575594432..2fb4529e5695e5 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
@@ -1,456 +1,430 @@
-// RUN: mlir-opt %s --sparse-tensor-conversion --canonicalize --cse | FileCheck %s
+// RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=false enable-convert=false" \
+// RUN: | FileCheck %s
+// RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=true enable-convert=false" \
+// RUN: | FileCheck %s
 
-#SparseMatrix = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : compressed, d1 : compressed)}>
 
-#SparseMatrix_P = #sparse_tensor.encoding<{map = (d0, d1) -> (d1 : compressed, d0 : compressed)}>
-
-#SparseMatrix_D_P = #sparse_tensor.encoding<{map = (d0, d1) -> (d1 : dense, d0 : dense)}>
-
-// CHECK-LABEL: func.func @concat_mix_dense(
-// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<2x4xf64>,
-// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
-// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
-// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
-// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
-// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
-// CHECK-DAG:     %[[TMP_c8_i8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
-// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
-// CHECK:         %[[TMP_0:.*]] = memref.alloc() : memref<5x4xf64>
-// CHECK:         linalg.fill ins(%[[TMP_cst]] : f64) outs(%[[TMP_0]] : memref<5x4xf64>)
-// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
-// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
-// CHECK:             %[[TMP_12:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<2x4xf64>
-// CHECK:             %[[TMP_13:.*]] = arith.cmpf une, %[[TMP_12]], %[[TMP_cst]] : f64
-// CHECK:             scf.if %[[TMP_13]] {
-// CHECK:               memref.store %[[TMP_12]], %[[TMP_0]][%[[TMP_arg2]], %[[TMP_arg3]]] : memref<5x4xf64>
-// CHECK:             }
-// CHECK:           }
-// CHECK:         }
-// CHECK-DAG:     %[[LvlTypes:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP:.*]] = memref.cast %[[LvlTypes]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP:.*]] = memref.cast %[[DimSizes]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c3]], %[[DimSizes]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP:.*]] = memref.cast %[[LvlSizes]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Iota:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[IotaP:.*]] = memref.cast %[[Iota]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Iota]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Iota]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[DimSizesP]], %[[LvlSizesP]], %[[LvlTypesP]], %[[IotaP]], %[[IotaP]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]])
-// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_9:.*]] = memref.cast %[[TMP_8]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_10:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.while : () -> () {
-// CHECK:           %[[TMP_12:.*]] = func.call @getNextF64(%[[TMP_7]], %[[TMP_9]], %[[TMP_10]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
-// CHECK:           scf.condition(%[[TMP_12]])
-// CHECK:         } do {
-// CHECK:           %[[TMP_12:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           %[[TMP_13:.*]] = arith.addi %[[TMP_12]], %[[TMP_c2]] : index
-// CHECK:           %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_15:.*]] = memref.load %[[TMP_10]][] : memref<f64>
-// CHECK:           memref.store %[[TMP_15]], %[[TMP_0]][%[[TMP_13]], %[[TMP_14]]] : memref<5x4xf64>
-// CHECK:           scf.yield
-// CHECK:         }
-// CHECK:         call @delSparseTensorIteratorF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         %[[TMP_11:.*]] = bufferization.to_tensor %[[TMP_0]] : memref<5x4xf64>
-// CHECK:         return %[[TMP_11]] : tensor<5x4xf64>
-// CHECK:       }
-func.func @concat_mix_dense(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #SparseMatrix>) -> tensor<5x4xf64> {
-  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 0 : index}
-       : tensor<2x4xf64>, tensor<3x4xf64, #SparseMatrix> to tensor<5x4xf64>
-  return %0 : tensor<5x4xf64>
-}
-
-// CHECK-LABEL: func.func @concat_mix_sparse(
-// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<2x4xf64>,
-// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
-// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
-// CHECK-DAG:     %[[TMP_c2_i32:.*]] = arith.constant 2 : i32
-// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
-// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
-// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:     %[[TMP_c4_i32:.*]] = arith.constant 4 : i32
-// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
-// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
-// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:     %[[TMP_c5:.*]] = arith.constant 5 : index
-// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
-// CHECK-DAG:     %[[TMP_c8_i8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:     %[[LvlTypes_0:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP_0:.*]] = memref.cast %[[LvlTypes_0]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_0]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_0]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP_0:.*]] = memref.cast %[[DimSizes_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c5]], %[[DimSizes_0]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes_0]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP_0:.*]] = memref.cast %[[LvlSizes_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Iota_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[IotaP_0:.*]] = memref.cast %[[Iota_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Iota_0]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Iota_0]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[NullPtr:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
-// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[DimSizesP_0]], %[[LvlSizesP_0]], %[[LvlTypesP_0]], %[[IotaP_0]], %[[IotaP_0]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c4_i32]], %[[NullPtr]])
-// CHECK:         %[[TMP_9:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_10:.*]] = memref.cast %[[TMP_9]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
-// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
-// CHECK:             memref.store %[[TMP_arg2]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:             memref.store %[[TMP_arg3]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:             %[[TMP_22:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<2x4xf64>
-// CHECK:             %[[TMP_23:.*]] = arith.cmpf une, %[[TMP_22]], %[[TMP_cst]] : f64
-// CHECK:             scf.if %[[TMP_23]] {
-// CHECK:               memref.store %[[TMP_22]], %[[TMP_8]][] : memref<f64>
-// CHECK:               %[[TMP_24:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_8]], %[[TMP_10]], %[[IotaP_0]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
-// CHECK:             }
-// CHECK:           }
-// CHECK:         }
-// CHECK-DAG:     %[[LvlTypes_1:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP_1:.*]] = memref.cast %[[LvlTypes_1]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_1]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_1]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP_1:.*]] = memref.cast %[[DimSizes_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c3]], %[[DimSizes_1]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes_1]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP_1:.*]] = memref.cast %[[LvlSizes_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Iota_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[IotaP_1:.*]] = memref.cast %[[Iota_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Iota_1]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Iota_1]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:         %[[TMP_17:.*]] = call @newSparseTensor(%[[DimSizesP_1]], %[[LvlSizesP_1]], %[[LvlTypesP_1]], %[[IotaP_1]], %[[IotaP_1]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]])
-// CHECK:         %[[TMP_18:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_19:.*]] = memref.cast %[[TMP_18]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_20:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.while : () -> () {
-// CHECK:           %[[TMP_22:.*]] = func.call @getNextF64(%[[TMP_17]], %[[TMP_19]], %[[TMP_20]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
-// CHECK:           scf.condition(%[[TMP_22]])
-// CHECK:         } do {
-// CHECK:           %[[TMP_22:.*]] = memref.load %[[TMP_18]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           %[[TMP_23:.*]] = arith.addi %[[TMP_22]], %[[TMP_c2]] : index
-// CHECK:           %[[TMP_24:.*]] = memref.load %[[TMP_18]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           memref.store %[[TMP_23]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           memref.store %[[TMP_24]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_25:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_20]], %[[TMP_10]], %[[IotaP_0]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
-// CHECK:           scf.yield
-// CHECK:         }
-// CHECK:         call @delSparseTensorIteratorF64(%[[TMP_17]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         %[[TMP_21:.*]] = call @newSparseTensor(%[[DimSizesP_0]], %[[LvlSizesP_0]], %[[LvlTypesP_0]], %[[IotaP_0]], %[[IotaP_0]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c2_i32]], %[[TMP_7]])
-// CHECK:         call @delSparseTensorCOOF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         return %[[TMP_21]] : !llvm.ptr<i8>
-// CHECK:       }
-func.func @concat_mix_sparse(%arg0: tensor<2x4xf64>, %arg1: tensor<3x4xf64, #SparseMatrix>) -> tensor<5x4xf64, #SparseMatrix> {
-  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 0 : index}
-       : tensor<2x4xf64>, tensor<3x4xf64, #SparseMatrix> to tensor<5x4xf64, #SparseMatrix>
-  return %0 : tensor<5x4xf64, #SparseMatrix>
+#DCSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : compressed, d1 : compressed)}>
+#DENSE = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : dense)}>
+#DENSE_P = #sparse_tensor.encoding<{map = (d0, d1) -> (d1 : dense, d0 : dense)}>
+// CHECK-LABEL: @concat_sparse_sparse(
+//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//       CHECK:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//       CHECK:  %[[TMP_c5:.*]] = arith.constant 5 : index
+//       CHECK:  %[[TMP_c2:.*]] = arith.constant 2 : index
+//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor() : tensor<9x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  %[[RET_1:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] iter_args(%[[A0:.*]] = %[[TMP_0]])
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    %[[RET_4:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A1:.*]] = %[[A0]])
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[NEW_1:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A1]][%[[TMP_23]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
+//       CHECK:      scf.yield %[[NEW_1]]
+//       CHECK:    }
+//       CHECK:    scf.yield %[[RET_4]]
+//       CHECK:  }
+//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  %[[RET_2:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] iter_args(%[[A2:.*]] = %[[RET_1]])
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    %[[RET_5:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A3:.*]] = %[[A2]])
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
+//       CHECK:      %[[NEW_2:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A3]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
+//       CHECK:      scf.yield %[[NEW_2]]
+//       CHECK:    }
+//       CHECK:    scf.yield %[[RET_5]]
+//       CHECK:  }
+//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  %[[RET_3:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] iter_args(%[[A4:.*]] = %[[RET_2]])
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    %[[RET_6:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A5:.*]] = %[[A4]])
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
+//       CHECK:      %[[NEW_3:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A5]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
+//       CHECK:      scf.yield %[[NEW_3]]
+//       CHECK:    }
+//       CHECK:    scf.yield %[[RET_6]]
+//       CHECK:  }
+//       CHECK:  %[[TMP_23:.*]] = sparse_tensor.load %[[RET_3]] hasInserts
+//       CHECK:  return %[[TMP_23]] : tensor<9x4xf64, #sparse_tensor
+func.func @concat_sparse_sparse(%arg0: tensor<2x4xf64, #DCSR>,
+                                %arg1: tensor<3x4xf64, #DCSR>,
+                                %arg2: tensor<4x4xf64, #DCSR>)
+                                -> tensor<9x4xf64, #DCSR> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #DCSR>,
+           tensor<3x4xf64, #DCSR>,
+           tensor<4x4xf64, #DCSR> to tensor<9x4xf64, #DCSR>
+    return %0 : tensor<9x4xf64, #DCSR>
 }
 
-// CHECK-LABEL: func.func @concat_mix_sparse_perm_dim1(
-// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<4x2xf64>,
-// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
-// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
-// CHECK-DAG:     %[[TMP_c2_i32:.*]] = arith.constant 2 : i32
-// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
-// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
-// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:     %[[TMP_c4_i32:.*]] = arith.constant 4 : i32
-// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
-// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
-// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
-// CHECK-DAG:     %[[TMP_c5:.*]] = arith.constant 5 : index
-// CHECK-DAG:     %[[TMP_c8_i8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:     %[[LvlTypes_0:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP_0:.*]] = memref.cast %[[LvlTypes_0]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_0]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_0]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP_0:.*]] = memref.cast %[[DimSizes_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes_0]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c5]], %[[DimSizes_0]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP_0:.*]] = memref.cast %[[LvlSizes_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Lvl2Dim_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[Lvl2DimP_0:.*]] = memref.cast %[[Lvl2Dim_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Dim2Lvl_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[Dim2LvlP_0:.*]] = memref.cast %[[Dim2Lvl_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Dim2Lvl_0]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Dim2Lvl_0]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[NullPtr:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
-// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[DimSizesP_0]], %[[LvlSizesP_0]], %[[LvlTypesP_0]], %[[Lvl2DimP_0]], %[[Dim2LvlP_0]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c4_i32]], %[[NullPtr]])
-// CHECK:         %[[TMP_9:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_10:.*]] = memref.cast %[[TMP_9]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
-// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
-// CHECK:             memref.store %[[TMP_arg2]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:             memref.store %[[TMP_arg3]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:             %[[TMP_22:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<4x2xf64>
-// CHECK:             %[[TMP_23:.*]] = arith.cmpf une, %[[TMP_22]], %[[TMP_cst]] : f64
-// CHECK:             scf.if %[[TMP_23]] {
-// CHECK:               memref.store %[[TMP_22]], %[[TMP_8]][] : memref<f64>
-// CHECK:               %[[TMP_24:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_8]], %[[TMP_10]], %[[Dim2LvlP_0]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
-// CHECK:             }
-// CHECK:           }
-// CHECK:         }
-// CHECK-DAG:     %[[LvlTypes_1:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP_1:.*]] = memref.cast %[[LvlTypes_1]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_1]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_1]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP_1:.*]] = memref.cast %[[DimSizes_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes_1]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c3]], %[[DimSizes_1]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP_1:.*]] = memref.cast %[[LvlSizes_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Iota_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[IotaP_1:.*]] = memref.cast %[[Iota_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Iota_1]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Iota_1]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:         %[[TMP_17:.*]] = call @newSparseTensor(%[[DimSizesP_1]], %[[LvlSizesP_1]], %[[LvlTypesP_1]], %[[IotaP_1]], %[[IotaP_1]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]])
-// CHECK:         %[[TMP_18:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_19:.*]] = memref.cast %[[TMP_18]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_20:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.while : () -> () {
-// CHECK:           %[[TMP_22:.*]] = func.call @getNextF64(%[[TMP_17]], %[[TMP_19]], %[[TMP_20]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
-// CHECK:           scf.condition(%[[TMP_22]])
-// CHECK:         } do {
-// CHECK:           %[[TMP_22:.*]] = memref.load %[[TMP_18]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           %[[TMP_23:.*]] = memref.load %[[TMP_18]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_24:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-// CHECK:           memref.store %[[TMP_22]], %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           memref.store %[[TMP_24]], %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_25:.*]] = func.call @addEltF64(%[[TMP_7]], %[[TMP_20]], %[[TMP_10]], %[[Dim2LvlP_0]]) : (!llvm.ptr<i8>, memref<f64>, memref<?xindex>, memref<?xindex>) -> !llvm.ptr<i8>
-// CHECK:           scf.yield
-// CHECK:         }
-// CHECK:         call @delSparseTensorIteratorF64(%[[TMP_17]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         %[[TMP_21:.*]] = call @newSparseTensor(%[[DimSizesP_0]], %[[LvlSizesP_0]], %[[LvlTypesP_0]], %[[Lvl2DimP_0]], %[[Dim2LvlP_0]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c2_i32]], %[[TMP_7]])
-// CHECK:         call @delSparseTensorCOOF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         return %[[TMP_21]] : !llvm.ptr<i8>
-// CHECK:       }
-func.func @concat_mix_sparse_perm_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #SparseMatrix_P>) -> tensor<4x5xf64, #SparseMatrix_P> {
-  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
-       : tensor<4x2xf64>, tensor<4x3xf64, #SparseMatrix_P> to tensor<4x5xf64, #SparseMatrix_P>
-  return %0 : tensor<4x5xf64, #SparseMatrix_P>
+// CHECK-LABEL: @concat_sparse_sparse_dynamic(
+//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
+//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
+//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
+//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
+//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor(%[[TMP_c9]], %[[TMP_c4]]) : tensor<?x?xf64, #sparse_tensor
+//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  %[[RET_1:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] iter_args(%[[A0:.*]] = %[[TMP_0]])
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    %[[RET_4:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A1:.*]] = %[[A0]])
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[NEW_1:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A1]][%[[TMP_23]], %[[TMP_27]]] : tensor<?x?xf64, #sparse_tensor
+//       CHECK:      scf.yield %[[NEW_1]]
+//       CHECK:    }
+//       CHECK:    scf.yield %[[RET_4]]
+//       CHECK:  }
+//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  %[[RET_2:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] iter_args(%[[A2:.*]] = %[[RET_1]])
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    %[[RET_5:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A3:.*]] = %[[A2]])
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
+//       CHECK:      %[[NEW_2:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A3]][%[[TMP_29]], %[[TMP_27]]] : tensor<?x?xf64, #sparse_tensor
+//       CHECK:      scf.yield %[[NEW_2]]
+//       CHECK:    }
+//       CHECK:    scf.yield %[[RET_5]]
+//       CHECK:  }
+//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  %[[RET_3:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] iter_args(%[[A4:.*]] = %[[RET_2]])
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    %[[RET_6:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A5:.*]] = %[[A4]])
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
+//       CHECK:      %[[NEW_3:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A5]][%[[TMP_29]], %[[TMP_27]]] : tensor<?x?xf64, #sparse_tensor
+//       CHECK:      scf.yield %[[NEW_3]]
+//       CHECK:    }
+//       CHECK:    scf.yield %[[RET_6]]
+//       CHECK:  }
+//       CHECK:  %[[TMP_23:.*]] = sparse_tensor.load %[[RET_3]] hasInserts
+//       CHECK:  return %[[TMP_23]] : tensor<?x?xf64, #sparse_tensor
+func.func @concat_sparse_sparse_dynamic(%arg0: tensor<2x4xf64, #DCSR>,
+                                %arg1: tensor<3x4xf64, #DCSR>,
+                                %arg2: tensor<4x4xf64, #DCSR>)
+                                -> tensor<?x?xf64, #DCSR> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #DCSR>,
+           tensor<3x4xf64, #DCSR>,
+           tensor<4x4xf64, #DCSR> to tensor<?x?xf64, #DCSR>
+    return %0 : tensor<?x?xf64, #DCSR>
 }
 
-// CHECK-LABEL: func.func @concat_mix_dense_perm_dim1(
-// CHECK-SAME:     %[[TMP_arg0:.*]]: tensor<4x2xf64>,
-// CHECK-SAME:     %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
-// CHECK-DAG:         %[[TMP_c2:.*]] = arith.constant 2 : index
-// CHECK-DAG:         %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
-// CHECK-DAG:         %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
-// CHECK-DAG:         %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
-// CHECK-DAG:         %[[TMP_c8_i8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:         %[[TMP_c3:.*]] = arith.constant 3 : index
-// CHECK-DAG:         %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK-DAG:         %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:         %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:         %[[TMP_c4:.*]] = arith.constant 4 : index
-// CHECK:         %[[TMP_0:.*]] = memref.alloc() : memref<4x5xf64>
-// CHECK:         linalg.fill ins(%[[TMP_cst]] : f64) outs(%[[TMP_0]] : memref<4x5xf64>)
-// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
-// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
-// CHECK:             %[[TMP_12:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<4x2xf64>
-// CHECK:             %[[TMP_13:.*]] = arith.cmpf une, %[[TMP_12]], %[[TMP_cst]] : f64
-// CHECK:             scf.if %[[TMP_13]] {
-// CHECK:               memref.store %[[TMP_12]], %[[TMP_0]][%[[TMP_arg2]], %[[TMP_arg3]]] : memref<4x5xf64>
-// CHECK:             }
-// CHECK:           }
-// CHECK:         }
-// CHECK-DAG:     %[[LvlTypes:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP:.*]] = memref.cast %[[LvlTypes]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP:.*]] = memref.cast %[[DimSizes]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c3]], %[[DimSizes]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP:.*]] = memref.cast %[[LvlSizes]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Iota:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[IotaP:.*]] = memref.cast %[[Iota]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Iota]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Iota]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[DimSizesP]], %[[LvlSizesP]], %[[LvlTypesP]], %[[IotaP]], %[[IotaP]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]])
-// CHECK:         %[[TMP_8:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_9:.*]] = memref.cast %[[TMP_8]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_10:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.while : () -> () {
-// CHECK:           %[[TMP_12:.*]] = func.call @getNextF64(%[[TMP_7]], %[[TMP_9]], %[[TMP_10]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
-// CHECK:           scf.condition(%[[TMP_12]])
-// CHECK:         } do {
-// CHECK:           %[[TMP_12:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_14:.*]] = arith.addi %[[TMP_13]], %[[TMP_c2]] : index
-// CHECK:           %[[TMP_15:.*]] = memref.load %[[TMP_10]][] : memref<f64>
-// CHECK:           memref.store %[[TMP_15]], %[[TMP_0]][%[[TMP_12]], %[[TMP_14]]] : memref<4x5xf64>
-// CHECK:           scf.yield
-// CHECK:         }
-// CHECK:         call @delSparseTensorIteratorF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         %[[TMP_11:.*]] = bufferization.to_tensor %[[TMP_0]] : memref<4x5xf64>
-// CHECK:         return %[[TMP_11]] : tensor<4x5xf64>
-// CHECK:       }
-func.func @concat_mix_dense_perm_dim1(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #SparseMatrix_P>) -> tensor<4x5xf64> {
-  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
-       : tensor<4x2xf64>, tensor<4x3xf64, #SparseMatrix_P> to tensor<4x5xf64>
-  return %0 : tensor<4x5xf64>
+// CHECK-LABEL: @concat_sparse_sparse_dense(
+//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
+//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
+//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
+//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
+//   CHECK-DAG:  %[[TMP_d0:.*]] = arith.constant 0.000000e+00 : f64
+//       CHECK:  %[[A:.*]] = memref.alloc(%[[TMP_c9]], %[[TMP_c4]]) : memref<?x?xf64>
+//       CHECK:  linalg.fill ins(%[[TMP_d0]] : f64) outs(%[[A]] : memref<?x?xf64>)
+//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      memref.store %[[TMP_28]], %[[A]]{{\[}}%[[TMP_23]], %[[TMP_27]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
+//       CHECK:      memref.store %[[TMP_28]], %[[A]]{{\[}}%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
+//       CHECK:      memref.store %[[TMP_28]], %[[A]]{{\[}}%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[R:.*]] = bufferization.to_tensor %[[A]] : memref<?x?xf64>
+//       CHECK:  return %[[R]] : tensor<?x?xf64>
+func.func @concat_sparse_sparse_dense(%arg0: tensor<2x4xf64, #DCSR>,
+                                %arg1: tensor<3x4xf64, #DCSR>,
+                                %arg2: tensor<4x4xf64, #DCSR>)
+                                -> tensor<?x?xf64> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #DCSR>,
+           tensor<3x4xf64, #DCSR>,
+           tensor<4x4xf64, #DCSR> to tensor<?x?xf64>
+    return %0 : tensor<?x?xf64>
 }
 
-// CHECK-LABEL: func.func @concat_mix_dense_perm_dim1_dyn(
-// CHECK-SAME:      %[[TMP_arg0:.*]]: tensor<3x2xf64>,
-// CHECK-SAME:      %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
-// CHECK-DAG:       %[[TMP_c2:.*]] = arith.constant 2 : index
-// CHECK-DAG:       %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
-// CHECK-DAG:       %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
-// CHECK-DAG:       %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
-// CHECK-DAG:       %[[TMP_c8_i8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:       %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:       %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:       %[[TMP_c3:.*]] = arith.constant 3 : index
-// CHECK-DAG:       %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK:           %[[TMP_0:.*]] = memref.alloc() : memref<3x5xf64>
-// CHECK:           %[[TMP_1:.*]] = memref.cast %[[TMP_0]] : memref<3x5xf64> to memref<?x?xf64>
-// CHECK:           linalg.fill ins(%[[TMP_cst]] : f64) outs(%[[TMP_0]] : memref<3x5xf64>)
-// CHECK:           scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c3]] step %[[TMP_c1]] {
-// CHECK:             scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
-// CHECK:               %[[TMP_13:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<3x2xf64>
-// CHECK:               %[[TMP_14:.*]] = arith.cmpf une, %[[TMP_13]], %[[TMP_cst]] : f64
-// CHECK:               scf.if %[[TMP_14]] {
-// CHECK:                 memref.store %[[TMP_13]], %[[TMP_0]][%[[TMP_arg2]], %[[TMP_arg3]]] : memref<3x5xf64>
-// CHECK:               }
-// CHECK:             }
-// CHECK:           }
-// CHECK-DAG:       %[[LvlTypes:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:       %[[LvlTypesP:.*]] = memref.cast %[[LvlTypes]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:       memref.store %[[TMP_c8_i8]], %[[LvlTypes]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:       memref.store %[[TMP_c8_i8]], %[[LvlTypes]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:       %[[DimSizes:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:       %[[DimSizesP:.*]] = memref.cast %[[DimSizes]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:       memref.store %[[TMP_c3]], %[[DimSizes]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:       memref.store %[[TMP_c3]], %[[DimSizes]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:       %[[LvlSizes:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:       %[[LvlSizesP:.*]] = memref.cast %[[LvlSizes]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:       %[[Iota:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:       %[[IotaP:.*]] = memref.cast %[[Iota]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:       memref.store %[[TMP_c0]], %[[Iota]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:       memref.store %[[TMP_c1]], %[[Iota]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_8:.*]] = call @newSparseTensor(%[[DimSizesP]], %[[LvlSizesP]], %[[LvlTypesP]], %[[IotaP]], %[[IotaP]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]])
-// CHECK:           %[[TMP_9:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:           %[[TMP_10:.*]] = memref.cast %[[TMP_9]] : memref<2xindex> to memref<?xindex>
-// CHECK:           %[[TMP_11:.*]] = memref.alloca() : memref<f64>
-// CHECK:           scf.while : () -> () {
-// CHECK:             %[[TMP_13:.*]] = func.call @getNextF64(%[[TMP_8]], %[[TMP_10]], %[[TMP_11]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
-// CHECK:             scf.condition(%[[TMP_13]])
-// CHECK:           } do {
-// CHECK:             %[[TMP_13:.*]] = memref.load %[[TMP_9]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:             %[[TMP_14:.*]] = memref.load %[[TMP_9]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:             %[[TMP_15:.*]] = arith.addi %[[TMP_14]], %[[TMP_c2]] : index
-// CHECK:             %[[TMP_16:.*]] = memref.load %[[TMP_11]][] : memref<f64>
-// CHECK:             memref.store %[[TMP_16]], %[[TMP_0]][%[[TMP_13]], %[[TMP_15]]] : memref<3x5xf64>
-// CHECK:             scf.yield
-// CHECK:           }
-// CHECK:           call @delSparseTensorIteratorF64(%[[TMP_8]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:           %[[TMP_12:.*]] = bufferization.to_tensor %[[TMP_1]] : memref<?x?xf64>
-// CHECK:           return %[[TMP_12]] : tensor<?x?xf64>
-// CHECK:       }
-func.func @concat_mix_dense_perm_dim1_dyn(%arg0: tensor<3x2xf64>, %arg1: tensor<3x3xf64, #SparseMatrix>) -> tensor<?x?xf64> {
-  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
-       : tensor<3x2xf64>, tensor<3x3xf64, #SparseMatrix> to tensor<?x?xf64>
-  return %0 : tensor<?x?xf64>
+// CHECK-LABEL: @concat_sparse_sparse_annotated_dense(
+//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
+//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
+//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
+//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
+//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor(%[[TMP_c9]], %[[TMP_c4]]) : tensor<?x?xf64, #sparse_tensor
+//       CHECK:  %[[VAL_0:.*]] = sparse_tensor.values %[[TMP_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
+//       CHECK:  %[[DIM_0:.*]] = memref.alloca() : memref<2xindex>
+//       CHECK:  memref.store %[[TMP_c9]], %[[DIM_0]][%[[TMP_c0]]] : memref<2xindex>
+//       CHECK:  memref.store %[[TMP_c4]], %[[DIM_0]][%[[TMP_c1]]] : memref<2xindex>
+//       CHECK:  %[[VAL_1:.*]] = memref.reshape %[[VAL_0]](%[[DIM_0]]) : (memref<?xf64>, memref<2xindex>) -> memref<?x?xf64>
+//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_23]], %[[TMP_27]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
+//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
+//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[R:.*]] = sparse_tensor.convert %[[TMP_0]]
+//       CHECK:  return %[[R]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+func.func @concat_sparse_sparse_annotated_dense(%arg0: tensor<2x4xf64, #DCSR>,
+                                %arg1: tensor<3x4xf64, #DCSR>,
+                                %arg2: tensor<4x4xf64, #DCSR>)
+                                -> tensor<?x?xf64, #DENSE> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #DCSR>,
+           tensor<3x4xf64, #DCSR>,
+           tensor<4x4xf64, #DCSR> to tensor<?x?xf64, #DENSE>
+    return %0 : tensor<?x?xf64, #DENSE>
 }
 
-// CHECK-LABEL: func.func @concat_annotated_dense(
-// CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<4x2xf64>,
-// CHECK-SAME:    %[[TMP_arg1:.*]]: !llvm.ptr<i8>)
-// CHECK-DAG:     %[[TMP_c2:.*]] = arith.constant 2 : index
-// CHECK-DAG:     %[[TMP_c6_i32:.*]] = arith.constant 6 : i32
-// CHECK-DAG:     %[[TMP_c4_i8:.*]] = arith.constant 4 : i8
-// CHECK-DAG:     %[[TMP_c8_i8:.*]] = arith.constant 8 : i8
-// CHECK-DAG:     %[[TMP_c3:.*]] = arith.constant 3 : index
-// CHECK-DAG:     %[[TMP_cst:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:     %[[TMP_c1_i32:.*]] = arith.constant 1 : i32
-// CHECK-DAG:     %[[TMP_c0_i32:.*]] = arith.constant 0 : i32
-// CHECK-DAG:     %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:     %[[TMP_c4:.*]] = arith.constant 4 : index
-// CHECK-DAG:     %[[TMP_c5:.*]] = arith.constant 5 : index
-// CHECK-DAG:     %[[LvlTypes_0:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP_0:.*]] = memref.cast %[[LvlTypes_0]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c4_i8]], %[[LvlTypes_0]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c4_i8]], %[[LvlTypes_0]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP_0:.*]] = memref.cast %[[DimSizes_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes_0]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c5]], %[[DimSizes_0]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP_0:.*]] = memref.cast %[[LvlSizes_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Lvl2Dim_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[Lvl2DimP_0:.*]] = memref.cast %[[Lvl2Dim_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Dim2Lvl_0:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[Dim2LvlP_0:.*]] = memref.cast %[[Dim2Lvl_0]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Dim2Lvl_0]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Dim2Lvl_0]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[NullPtr:.*]] = llvm.mlir.zero : !llvm.ptr<i8>
-// CHECK:         %[[TMP_7:.*]] = call @newSparseTensor(%[[DimSizesP_0]], %[[LvlSizesP_0]], %[[LvlTypesP_0]], %[[Lvl2DimP_0]], %[[Dim2LvlP_0]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c0_i32]], %[[NullPtr]])
-// CHECK:         %[[Values_r:.*]] = call @sparseValuesF64(%[[TMP_7]]) : (!llvm.ptr<i8>) -> memref<?xf64>
-// CHECK:         %[[Values:.*]] = memref.reshape %[[Values_r]]
-// CHECK:         scf.for %[[TMP_arg2:.*]] = %[[TMP_c0]] to %[[TMP_c4]] step %[[TMP_c1]] {
-// CHECK:           scf.for %[[TMP_arg3:.*]] = %[[TMP_c0]] to %[[TMP_c2]] step %[[TMP_c1]] {
-// CHECK:             %[[TMP_22:.*]] = tensor.extract %[[TMP_arg0]][%[[TMP_arg2]], %[[TMP_arg3]]] : tensor<4x2xf64>
-// CHECK:             %[[TMP_23:.*]] = arith.cmpf une, %[[TMP_22]], %[[TMP_cst]] : f64
-// CHECK:             scf.if %[[TMP_23]] {
-// CHECK:               memref.store %[[TMP_22]], %[[Values]][%[[TMP_arg3]], %[[TMP_arg2]]] : memref<?x?xf64>
-// CHECK:             }
-// CHECK:           }
-// CHECK:         }
-// CHECK-DAG:     %[[LvlTypes_1:.*]] = memref.alloca() : memref<2xi8>
-// CHECK-DAG:     %[[LvlTypesP_1:.*]] = memref.cast %[[LvlTypes_1]] : memref<2xi8> to memref<?xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_1]][%[[TMP_c0]]] : memref<2xi8>
-// CHECK-DAG:     memref.store %[[TMP_c8_i8]], %[[LvlTypes_1]][%[[TMP_c1]]] : memref<2xi8>
-// CHECK-DAG:     %[[DimSizes_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[DimSizesP_1:.*]] = memref.cast %[[DimSizes_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c4]], %[[DimSizes_1]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c3]], %[[DimSizes_1]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizes_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[LvlSizesP_1:.*]] = memref.cast %[[LvlSizes_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     %[[Iota_1:.*]] = memref.alloca() : memref<2xindex>
-// CHECK-DAG:     %[[IotaP_1:.*]] = memref.cast %[[Iota_1]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:     memref.store %[[TMP_c0]], %[[Iota_1]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK-DAG:     memref.store %[[TMP_c1]], %[[Iota_1]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:         %[[TMP_17:.*]] = call @newSparseTensor(%[[DimSizesP_1]], %[[LvlSizesP_1]], %[[LvlTypesP_1]], %[[IotaP_1]], %[[IotaP_1]], %[[TMP_c0_i32]], %[[TMP_c0_i32]], %[[TMP_c1_i32]], %[[TMP_c6_i32]], %[[TMP_arg1]])
-// CHECK:         %[[TMP_18:.*]] = memref.alloca() : memref<2xindex>
-// CHECK:         %[[TMP_19:.*]] = memref.cast %[[TMP_18]] : memref<2xindex> to memref<?xindex>
-// CHECK:         %[[TMP_20:.*]] = memref.alloca() : memref<f64>
-// CHECK:         scf.while : () -> () {
-// CHECK:           %[[TMP_22:.*]] = func.call @getNextF64(%[[TMP_17]], %[[TMP_19]], %[[TMP_20]]) : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
-// CHECK:           scf.condition(%[[TMP_22]])
-// CHECK:         } do {
-// CHECK:           %[[TMP_22:.*]] = memref.load %[[TMP_18]][%[[TMP_c0]]] : memref<2xindex>
-// CHECK:           %[[TMP_23:.*]] = memref.load %[[TMP_18]][%[[TMP_c1]]] : memref<2xindex>
-// CHECK:           %[[TMP_24:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-// CHECK:           %[[TMP_25:.*]] = memref.load %[[TMP_20]][] : memref<f64>
-// CHECK:           memref.store %[[TMP_25]], %[[Values]][%[[TMP_24]], %[[TMP_22]]] : memref<?x?xf64>
-// CHECK:           scf.yield
-// CHECK:         }
-// CHECK:         call @delSparseTensorIteratorF64(%[[TMP_17]]) : (!llvm.ptr<i8>) -> ()
-// CHECK:         return %[[TMP_7]] : !llvm.ptr<i8>
-// CHECK:       }
-func.func @concat_annotated_dense(%arg0: tensor<4x2xf64>, %arg1: tensor<4x3xf64, #SparseMatrix_P>) -> tensor<4x5xf64, #SparseMatrix_D_P> {
-  %0 = sparse_tensor.concatenate %arg0, %arg1 {dimension = 1 : index}
-       : tensor<4x2xf64>, tensor<4x3xf64, #SparseMatrix_P> to tensor<4x5xf64, #SparseMatrix_D_P>
-  return %0 : tensor<4x5xf64, #SparseMatrix_D_P>
+// CHECK-LABEL: @concat_sparse_sparse_annotated_dense_permute(
+//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
+//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
+//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
+//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
+//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
+//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor(%[[TMP_c9]], %[[TMP_c4]]) : tensor<?x?xf64, #sparse_tensor
+//       CHECK:  %[[VAL_0:.*]] = sparse_tensor.values %[[TMP_0]] : tensor<?x?xf64, #sparse_tensor
+//       CHECK:  %[[DIM_0:.*]] = memref.alloca() : memref<2xindex>
+//       CHECK:  memref.store %[[TMP_c4]], %[[DIM_0]][%[[TMP_c0]]] : memref<2xindex>
+//       CHECK:  memref.store %[[TMP_c9]], %[[DIM_0]][%[[TMP_c1]]] : memref<2xindex>
+//       CHECK:  %[[VAL_1:.*]] = memref.reshape %[[VAL_0]](%[[DIM_0]]) : (memref<?xf64>, memref<2xindex>) -> memref<?x?xf64>
+//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_27]], %[[TMP_23]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
+//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_27]], %[[TMP_29]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
+//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]]
+//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
+//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
+//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
+//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_27]], %[[TMP_29]]] : memref<?x?xf64>
+//       CHECK:    }
+//       CHECK:  }
+//       CHECK:  %[[R:.*]] = sparse_tensor.convert %[[TMP_0]]
+//       CHECK:  return %[[R]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+func.func @concat_sparse_sparse_annotated_dense_permute(%arg0: tensor<2x4xf64, #DCSR>,
+                                %arg1: tensor<3x4xf64, #DCSR>,
+                                %arg2: tensor<4x4xf64, #DCSR>)
+                                -> tensor<?x?xf64, #DENSE_P> {
+    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
+         : tensor<2x4xf64, #DCSR>,
+           tensor<3x4xf64, #DCSR>,
+           tensor<4x4xf64, #DCSR> to tensor<?x?xf64, #DENSE_P>
+    return %0 : tensor<?x?xf64, #DENSE_P>
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir
deleted file mode 100644
index 6f9c45842ffb2e..00000000000000
--- a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir
+++ /dev/null
@@ -1,427 +0,0 @@
-// RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=false enable-convert=false" \
-// RUN: | FileCheck %s
-
-#DCSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : compressed, d1 : compressed)}>
-#DENSE = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : dense)}>
-#DENSE_P = #sparse_tensor.encoding<{map = (d0, d1) -> (d1 : dense, d0 : dense)}>
-// CHECK-LABEL: @concat_sparse_sparse(
-//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_c0:.*]] = arith.constant 0 : index
-//       CHECK:  %[[TMP_c1:.*]] = arith.constant 1 : index
-//       CHECK:  %[[TMP_c5:.*]] = arith.constant 5 : index
-//       CHECK:  %[[TMP_c2:.*]] = arith.constant 2 : index
-//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor() : tensor<9x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_1:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] iter_args(%[[A0:.*]] = %[[TMP_0]])
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_4:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A1:.*]] = %[[A0]])
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[NEW_1:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A1]][%[[TMP_23]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_1]]
-//       CHECK:    }
-//       CHECK:    scf.yield %[[RET_4]]
-//       CHECK:  }
-//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_2:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] iter_args(%[[A2:.*]] = %[[RET_1]])
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_5:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A3:.*]] = %[[A2]])
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-//       CHECK:      %[[NEW_2:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A3]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_2]]
-//       CHECK:    }
-//       CHECK:    scf.yield %[[RET_5]]
-//       CHECK:  }
-//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_3:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] iter_args(%[[A4:.*]] = %[[RET_2]])
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_6:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A5:.*]] = %[[A4]])
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
-//       CHECK:      %[[NEW_3:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A5]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_3]]
-//       CHECK:    }
-//       CHECK:    scf.yield %[[RET_6]]
-//       CHECK:  }
-//       CHECK:  %[[TMP_23:.*]] = sparse_tensor.load %[[RET_3]] hasInserts
-//       CHECK:  return %[[TMP_23]] : tensor<9x4xf64, #sparse_tensor
-func.func @concat_sparse_sparse(%arg0: tensor<2x4xf64, #DCSR>,
-                                %arg1: tensor<3x4xf64, #DCSR>,
-                                %arg2: tensor<4x4xf64, #DCSR>)
-                                -> tensor<9x4xf64, #DCSR> {
-    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
-         : tensor<2x4xf64, #DCSR>,
-           tensor<3x4xf64, #DCSR>,
-           tensor<4x4xf64, #DCSR> to tensor<9x4xf64, #DCSR>
-    return %0 : tensor<9x4xf64, #DCSR>
-}
-
-// CHECK-LABEL: @concat_sparse_sparse_dynamic(
-//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
-//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
-//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
-//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
-//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
-//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
-//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor(%[[TMP_c9]], %[[TMP_c4]]) : tensor<?x?xf64, #sparse_tensor
-//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_1:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] iter_args(%[[A0:.*]] = %[[TMP_0]])
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_4:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A1:.*]] = %[[A0]])
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[NEW_1:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A1]][%[[TMP_23]], %[[TMP_27]]] : tensor<?x?xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_1]]
-//       CHECK:    }
-//       CHECK:    scf.yield %[[RET_4]]
-//       CHECK:  }
-//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_2:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] iter_args(%[[A2:.*]] = %[[RET_1]])
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_5:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A3:.*]] = %[[A2]])
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-//       CHECK:      %[[NEW_2:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A3]][%[[TMP_29]], %[[TMP_27]]] : tensor<?x?xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_2]]
-//       CHECK:    }
-//       CHECK:    scf.yield %[[RET_5]]
-//       CHECK:  }
-//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_3:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] iter_args(%[[A4:.*]] = %[[RET_2]])
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_6:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A5:.*]] = %[[A4]])
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
-//       CHECK:      %[[NEW_3:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A5]][%[[TMP_29]], %[[TMP_27]]] : tensor<?x?xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_3]]
-//       CHECK:    }
-//       CHECK:    scf.yield %[[RET_6]]
-//       CHECK:  }
-//       CHECK:  %[[TMP_23:.*]] = sparse_tensor.load %[[RET_3]] hasInserts
-//       CHECK:  return %[[TMP_23]] : tensor<?x?xf64, #sparse_tensor
-func.func @concat_sparse_sparse_dynamic(%arg0: tensor<2x4xf64, #DCSR>,
-                                %arg1: tensor<3x4xf64, #DCSR>,
-                                %arg2: tensor<4x4xf64, #DCSR>)
-                                -> tensor<?x?xf64, #DCSR> {
-    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
-         : tensor<2x4xf64, #DCSR>,
-           tensor<3x4xf64, #DCSR>,
-           tensor<4x4xf64, #DCSR> to tensor<?x?xf64, #DCSR>
-    return %0 : tensor<?x?xf64, #DCSR>
-}
-
-// CHECK-LABEL: @concat_sparse_sparse_dense(
-//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
-//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
-//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
-//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
-//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
-//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
-//   CHECK-DAG:  %[[TMP_d0:.*]] = arith.constant 0.000000e+00 : f64
-//       CHECK:  %[[A:.*]] = memref.alloc(%[[TMP_c9]], %[[TMP_c4]]) : memref<?x?xf64>
-//       CHECK:  linalg.fill ins(%[[TMP_d0]] : f64) outs(%[[A]] : memref<?x?xf64>)
-//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      memref.store %[[TMP_28]], %[[A]]{{\[}}%[[TMP_23]], %[[TMP_27]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-//       CHECK:      memref.store %[[TMP_28]], %[[A]]{{\[}}%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
-//       CHECK:      memref.store %[[TMP_28]], %[[A]]{{\[}}%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[R:.*]] = bufferization.to_tensor %[[A]] : memref<?x?xf64>
-//       CHECK:  return %[[R]] : tensor<?x?xf64>
-func.func @concat_sparse_sparse_dense(%arg0: tensor<2x4xf64, #DCSR>,
-                                %arg1: tensor<3x4xf64, #DCSR>,
-                                %arg2: tensor<4x4xf64, #DCSR>)
-                                -> tensor<?x?xf64> {
-    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
-         : tensor<2x4xf64, #DCSR>,
-           tensor<3x4xf64, #DCSR>,
-           tensor<4x4xf64, #DCSR> to tensor<?x?xf64>
-    return %0 : tensor<?x?xf64>
-}
-
-// CHECK-LABEL: @concat_sparse_sparse_annotated_dense(
-//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
-//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
-//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
-//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
-//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
-//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
-//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor(%[[TMP_c9]], %[[TMP_c4]]) : tensor<?x?xf64, #sparse_tensor
-//       CHECK:  %[[VAL_0:.*]] = sparse_tensor.values %[[TMP_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
-//       CHECK:  %[[DIM_0:.*]] = memref.alloca() : memref<2xindex>
-//       CHECK:  memref.store %[[TMP_c9]], %[[DIM_0]][%[[TMP_c0]]] : memref<2xindex>
-//       CHECK:  memref.store %[[TMP_c4]], %[[DIM_0]][%[[TMP_c1]]] : memref<2xindex>
-//       CHECK:  %[[VAL_1:.*]] = memref.reshape %[[VAL_0]](%[[DIM_0]]) : (memref<?xf64>, memref<2xindex>) -> memref<?x?xf64>
-//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_23]], %[[TMP_27]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
-//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_29]], %[[TMP_27]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[R:.*]] = sparse_tensor.convert %[[TMP_0]]
-//       CHECK:  return %[[R]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-func.func @concat_sparse_sparse_annotated_dense(%arg0: tensor<2x4xf64, #DCSR>,
-                                %arg1: tensor<3x4xf64, #DCSR>,
-                                %arg2: tensor<4x4xf64, #DCSR>)
-                                -> tensor<?x?xf64, #DENSE> {
-    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
-         : tensor<2x4xf64, #DCSR>,
-           tensor<3x4xf64, #DCSR>,
-           tensor<4x4xf64, #DCSR> to tensor<?x?xf64, #DENSE>
-    return %0 : tensor<?x?xf64, #DENSE>
-}
-
-// CHECK-LABEL: @concat_sparse_sparse_annotated_dense_permute(
-//  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse_tensor
-//  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse_tensor
-//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
-//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
-//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
-//   CHECK-DAG:  %[[TMP_c9:.*]] = arith.constant 9 : index
-//   CHECK-DAG:  %[[TMP_c4:.*]] = arith.constant 4 : index
-//       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor(%[[TMP_c9]], %[[TMP_c4]]) : tensor<?x?xf64, #sparse_tensor
-//       CHECK:  %[[VAL_0:.*]] = sparse_tensor.values %[[TMP_0]] : tensor<?x?xf64, #sparse_tensor
-//       CHECK:  %[[DIM_0:.*]] = memref.alloca() : memref<2xindex>
-//       CHECK:  memref.store %[[TMP_c4]], %[[DIM_0]][%[[TMP_c0]]] : memref<2xindex>
-//       CHECK:  memref.store %[[TMP_c9]], %[[DIM_0]][%[[TMP_c1]]] : memref<2xindex>
-//       CHECK:  %[[VAL_1:.*]] = memref.reshape %[[VAL_0]](%[[DIM_0]]) : (memref<?xf64>, memref<2xindex>) -> memref<?x?xf64>
-//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_4:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_27]], %[[TMP_23]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[TMP_8:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_9:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_10:.*]] = sparse_tensor.positions %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_11:.*]] = sparse_tensor.coordinates %[[TMP_arg1]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
-//   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_27]], %[[TMP_29]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[TMP_15:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_16:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 0 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_17:.*]] = sparse_tensor.positions %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_18:.*]] = sparse_tensor.coordinates %[[TMP_arg2]] {level = 1 : index} : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
-//       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
-//       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]]
-//       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
-//       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-//       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]]
-//       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
-//       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
-//       CHECK:      memref.store %[[TMP_28]], %[[VAL_1]][%[[TMP_27]], %[[TMP_29]]] : memref<?x?xf64>
-//       CHECK:    }
-//       CHECK:  }
-//       CHECK:  %[[R:.*]] = sparse_tensor.convert %[[TMP_0]]
-//       CHECK:  return %[[R]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-func.func @concat_sparse_sparse_annotated_dense_permute(%arg0: tensor<2x4xf64, #DCSR>,
-                                %arg1: tensor<3x4xf64, #DCSR>,
-                                %arg2: tensor<4x4xf64, #DCSR>)
-                                -> tensor<?x?xf64, #DENSE_P> {
-    %0 = sparse_tensor.concatenate %arg0, %arg1, %arg2 {dimension = 0 : index}
-         : tensor<2x4xf64, #DCSR>,
-           tensor<3x4xf64, #DCSR>,
-           tensor<4x4xf64, #DCSR> to tensor<?x?xf64, #DENSE_P>
-    return %0 : tensor<?x?xf64, #DENSE_P>
-}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
index 4fbd36906eacf0..12d443ca679207 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
@@ -116,7 +116,7 @@
 // CHECK:               } {"Emitted from" = "linalg.generic"}
 // CHECK:               scf.yield %[[VAL_64:.*]] : index
 // CHECK:             } {"Emitted from" = "linalg.generic"}
-// CHECK:             sparse_tensor.sort_coo  hybrid_quick_sort %[[VAL_65:.*]], %[[VAL_33]]
+// CHECK:             sparse_tensor.sort  hybrid_quick_sort %[[VAL_65:.*]], %[[VAL_33]]
 // CHECK:             %[[VAL_66:.*]]:4 = scf.for %[[VAL_67:.*]] = %[[VAL_10]] to %[[VAL_65]] step %[[VAL_11]] iter_args(%[[VAL_68:.*]] = %[[VAL_36]], %[[VAL_69:.*]] = %[[VAL_37]], %[[VAL_70:.*]] = %[[VAL_38]], %[[VAL_71:.*]] = %[[VAL_39]]) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
 // CHECK:               %[[VAL_72:.*]] = memref.load %[[VAL_32]]{{\[}}%[[VAL_67]]] : memref<4xindex>
 // CHECK:               %[[VAL_73:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_72]]] : memref<4xf64>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
index 5ecc92231068c1..543fcaededf496 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
@@ -1,7 +1,8 @@
 // RUN: mlir-opt %s | mlir-opt | FileCheck %s --check-prefix=CHECK-ROUND
-// RUN: mlir-opt %s --sparse-tensor-conversion --cse --canonicalize | FileCheck %s --check-prefix=CHECK-CONV
+// RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=true enable-convert=false" \
+// RUN: --cse --canonicalize  | FileCheck %s
 // RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=false enable-convert=false" \
-// RUN: --cse --canonicalize  | FileCheck %s --check-prefix=CHECK-RWT
+// RUN: --cse --canonicalize  | FileCheck %s
 
 #SparseVector = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
 #SparseMatrix = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
@@ -14,55 +15,28 @@
 //      CHECK-ROUND:  %[[E:.*]] = tensor.expand_shape %[[A]] {{\[\[}}0, 1]] : tensor<100xf64, #sparse_tensor.encoding<{{{.*}}}>> into tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //      CHECK-ROUND:  return %[[E]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
-// conversion:
-//
-// CHECK-CONV-LABEL: func.func @sparse_expand(
-// CHECK-CONV-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// CHECK-CONV-DAG:  %[[C1:.*]] = arith.constant 1 : index
-// CHECK-CONV-DAG:  %[[C10:.*]] = arith.constant 10 : index
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV:      scf.while : () -> () {
-// CHECK-CONV:        call @getNextF64
-// CHECK-CONV:        scf.condition
-// CHECK-CONV:      } do {
-// CHECK-CONV:        %[[X:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<1xindex>
-// CHECK-CONV:        %[[D:.*]] = arith.divui %[[X]], %[[C10]] : index
-// CHECK-CONV:        %[[R:.*]] = arith.remui %[[X]], %[[C10]] : index
-// CHECK-CONV:        memref.store %[[D]], %{{.*}}[%[[C0]]] : memref<2xindex>
-// CHECK-CONV:        memref.store %[[R]], %{{.*}}[%[[C1]]] : memref<2xindex>
-// CHECK-CONV:        call @addEltF64
-// CHECK-CONV:        scf.yield
-// CHECK-CONV:      }
-// CHECK-CONV:      %[[N:.*]] = call @newSparseTensor
-// CHECK-CONV:      call @delSparseTensorCOOF64
-// CHECK-CONV:      call @delSparseTensorIteratorF64
-// CHECK-CONV:      return %[[N]] : !llvm.ptr<i8>
-//
-// rewrite for codegen:
-//
-// CHECK-RWT-LABEL:   func.func @sparse_expand(
-// CHECK-RWT-SAME:    %[[S:.*]]:
-// CHECK-RWT-DAG:     %[[C10:.*]] = arith.constant 10 : index
-// CHECK-RWT-DAG:     %[[C0:.*]] = arith.constant 0 : index
-// CHECK-RWT-DAG:     %[[C1:.*]] = arith.constant 1 : index
-// CHECK-RWT:         %[[B:.*]] = bufferization.alloc_tensor()
-// CHECK-RWT:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
-// CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R:.*]] = %[[B]])
-// CHECK-RWT:           %[[SI:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT:           %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[I]]] : memref<?xf64>
-// CHECK-RWT:           %[[DI0:.*]] = arith.divui %[[SI]], %[[C10]] : index
-// CHECK-RWT:           %[[DI1:.*]] = arith.remui %[[SI]], %[[C10]] : index
-// CHECK-RWT:           %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R]]{{\[}}%[[DI0]], %[[DI1]]]
-// CHECK-RWT:           scf.yield %[[NT:.*]]
-// CHECK-RWT:         }
-// CHECK-RWT:         %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT-NOT:     sparse_tensor.convert
-// CHECK-RWT:         return %[[NT1]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-LABEL:   func.func @sparse_expand(
+// CHECK-SAME:    %[[S:.*0]]:
+// CHECK-DAG:     %[[C10:.*]] = arith.constant 10 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
+// CHECK:         %[[B:.*]] = bufferization.alloc_tensor()
+// CHECK:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
+// CHECK:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
+// CHECK:         %[[V:.*]] = sparse_tensor.values %[[S]]
+// CHECK:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
+// CHECK:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R:.*]] = %[[B]])
+// CHECK:           %[[SI:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK:           %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[I]]] : memref<?xf64>
+// CHECK:           %[[DI0:.*]] = arith.divui %[[SI]], %[[C10]] : index
+// CHECK:           %[[DI1:.*]] = arith.remui %[[SI]], %[[C10]] : index
+// CHECK:           %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R]]{{\[}}%[[DI0]], %[[DI1]]]
+// CHECK:           scf.yield %[[NT:.*]]
+// CHECK:         }
+// CHECK:         %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
+// CHECK-NOT:     sparse_tensor.convert
+// CHECK:         return %[[NT1]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
 func.func @sparse_expand(%arg0: tensor<100xf64, #SparseVector>) -> tensor<10x10xf64, #SparseMatrix> {
   %0 = tensor.expand_shape %arg0 [[0, 1]] :
@@ -78,64 +52,37 @@ func.func @sparse_expand(%arg0: tensor<100xf64, #SparseVector>) -> tensor<10x10x
 //      CHECK-ROUND:  %[[C:.*]] = tensor.collapse_shape %[[A]] {{\[\[}}0, 1]] : tensor<10x10xf64, #sparse_tensor.encoding<{{{.*}}}>> into tensor<100xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //      CHECK-ROUND:  return %[[C]] : tensor<100xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
-// conversion:
-//
-// CHECK-CONV-LABEL: func.func @sparse_collapse(
-// CHECK-CONV-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// CHECK-CONV-DAG:  %[[C1:.*]] = arith.constant 1 : index
-// CHECK-CONV-DAG:  %[[C10:.*]] = arith.constant 10 : index
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV:      scf.while : () -> () {
-// CHECK-CONV:        call @getNextF64
-// CHECK-CONV:        scf.condition
-// CHECK-CONV:      } do {
-// CHECK-CONV:        %[[X:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<2xindex>
-// CHECK-CONV:        %[[Y:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<2xindex>
-// CHECK-CONV:        %[[M:.*]] = arith.muli %[[X]], %[[C10]] : index
-// CHECK-CONV:        %[[A:.*]] = arith.addi %[[M]], %[[Y]] : index
-// CHECK-CONV:        memref.store %[[A]], %{{.*}}[%[[C0]]] : memref<1xindex>
-// CHECK-CONV:        call @addEltF64
-// CHECK-CONV:        scf.yield
-// CHECK-CONV:      }
-// CHECK-CONV:      %[[N:.*]] = call @newSparseTensor
-// CHECK-CONV:      call @delSparseTensorCOOF64
-// CHECK-CONV:      call @delSparseTensorIteratorF64
-// CHECK-CONV:      return %[[N]] : !llvm.ptr<i8>
-//
-// rewrite for codegen:
-//
-// CHECK-RWT-LABEL:   func.func @sparse_collapse(
-// CHECK-RWT-SAME:    %[[S:.*]]:
-// CHECK-RWT-DAG:     %[[C10:.*]] = arith.constant 10 : index
-// CHECK-RWT-DAG:     %[[C0:.*]] = arith.constant 0 : index
-// CHECK-RWT-DAG:     %[[C1:.*]] = arith.constant 1 : index
-// CHECK-RWT:         %[[B:.*]] = bufferization.alloc_tensor()
-// CHECK-RWT:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[P1:.*]] = sparse_tensor.positions %[[S]] {level = 1 : index}
-// CHECK-RWT:         %[[I1:.*]] = sparse_tensor.coordinates %[[S]] {level = 1 : index}
-// CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
-// CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[A0:.*]] = %[[B]])
-// CHECK-RWT:           %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
-// CHECK-RWT:           %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
-// CHECK-RWT:           %[[RET_1:.*]] = scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] iter_args(%[[A1:.*]] = %[[A0]])
-// CHECK-RWT:             %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
-// CHECK-RWT:             %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
-// CHECK-RWT:             %[[T:.*]] = arith.muli %[[SI0]], %[[C10]] : index
-// CHECK-RWT:             %[[DI:.*]] = arith.addi %[[T]], %[[SI1]] : index
-// CHECK-RWT:             %[[R1:.*]] = sparse_tensor.insert %[[SV]] into %[[A1]]{{\[}}%[[DI]]]
-// CHECK-RWT              scf.yield %[[R1]]
-// CHECK-RWT            }
-// CHECK-RWT            scf.yield %[[RET_1]]
-// CHECK-RWT:         }
-// CHECK-RWT:        %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT-NOT:    sparse_tensor.convert
-// CHECK-RWT:        return %[[NT1]] : tensor<100xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-LABEL:   func.func @sparse_collapse(
+// CHECK-SAME:    %[[S:.*0]]:
+// CHECK-DAG:     %[[C10:.*]] = arith.constant 10 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
+// CHECK:         %[[B:.*]] = bufferization.alloc_tensor()
+// CHECK:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
+// CHECK:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
+// CHECK:         %[[P1:.*]] = sparse_tensor.positions %[[S]] {level = 1 : index}
+// CHECK:         %[[I1:.*]] = sparse_tensor.coordinates %[[S]] {level = 1 : index}
+// CHECK:         %[[V:.*]] = sparse_tensor.values %[[S]]
+// CHECK:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
+// CHECK:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[A0:.*]] = %[[B]])
+// CHECK:           %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-DAG:       %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-DAG:       %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
+// CHECK:           %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
+// CHECK:           %[[RET_1:.*]] = scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] iter_args(%[[A1:.*]] = %[[A0]])
+// CHECK:             %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
+// CHECK:             %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
+// CHECK:             %[[T:.*]] = arith.muli %[[SI0]], %[[C10]] : index
+// CHECK:             %[[DI:.*]] = arith.addi %[[T]], %[[SI1]] : index
+// CHECK:             %[[R1:.*]] = sparse_tensor.insert %[[SV]] into %[[A1]]{{\[}}%[[DI]]]
+// CHECK              scf.yield %[[R1]]
+// CHECK            }
+// CHECK            scf.yield %[[RET_1]]
+// CHECK:         }
+// CHECK:        %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
+// CHECK-NOT:    sparse_tensor.convert
+// CHECK:        return %[[NT1]] : tensor<100xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
 func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<100xf64, #SparseVector> {
   %0 = tensor.collapse_shape %arg0 [[0, 1]] :
@@ -151,66 +98,34 @@ func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<10
 //      CHECK-ROUND:  %[[E:.*]] = tensor.expand_shape %[[A]] {{\[\[}}0, 1]] : tensor<?xf64, #sparse_tensor.encoding<{{{.*}}}>> into tensor<?x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //      CHECK-ROUND:  return %[[E]] : tensor<?x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
-// conversion:
-//
-// CHECK-CONV-LABEL: func.func @dynamic_sparse_expand(
-// CHECK-CONV-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// CHECK-CONV-DAG:  %[[C1:.*]] = arith.constant 1 : index
-// CHECK-CONV-DAG:  %[[C10:.*]] = arith.constant 10 : index
-// CHECK-CONV-DAG:  %[[D1:.*]] = arith.divui %{{.*}}, %[[C10]] : index
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV:      scf.while : () -> () {
-// CHECK-CONV:        call @getNextF64
-// CHECK-CONV:        scf.condition
-// CHECK-CONV:      } do {
-// CHECK-CONV:        %[[L:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<1xindex>
-// CHECK-CONV:        %[[M:.*]] = arith.muli %[[D1]], %[[C10]] : index
-// CHECK-CONV:        %[[D2:.*]] = arith.divui %[[M]], %[[D1]] : index
-// CHECK-CONV:        %[[D3:.*]] = arith.divui %[[L]], %[[D2]] : index
-// CHECK-CONV:        %[[R:.*]] = arith.remui %[[L]], %[[D2]] : index
-// CHECK-CONV:        %[[D4:.*]] = arith.divui %[[D2]], %[[C10]] : index
-// CHECK-CONV:        %[[D5:.*]] = arith.divui %[[R]], %[[D4]] : index
-// CHECK-CONV:        memref.store %[[D3]], %{{.*}}[%[[C0]]] : memref<2xindex>
-// CHECK-CONV:        memref.store %[[D5]], %{{.*}}[%[[C1]]] : memref<2xindex>
-// CHECK-CONV:        call @addEltF64
-// CHECK-CONV:        scf.yield
-// CHECK-CONV:      }
-// CHECK-CONV:      %[[N:.*]] = call @newSparseTensor
-// CHECK-CONV:      call @delSparseTensorCOOF64
-// CHECK-CONV:      call @delSparseTensorIteratorF64
-// CHECK-CONV:      return %[[N]] : !llvm.ptr<i8>
-//
-// rewrite for codegen:
-//
-// CHECK-RWT-LABEL:   func.func @dynamic_sparse_expand(
-// CHECK-RWT-SAME:    %[[S:.*]]:
-// CHECK-RWT-DAG:     %[[C10:.*]] = arith.constant 10 : index
-// CHECK-RWT-DAG:     %[[C0:.*]] = arith.constant 0 : index
-// CHECK-RWT-DAG:     %[[C1:.*]] = arith.constant 1 : index
-// CHECK-RWT:         %[[SD:.*]] = tensor.dim %[[S]], %[[C0]]
-// CHECK-RWT:         %[[DD0:.*]] = arith.divui %[[SD]], %[[C10]] : index
-// CHECK-RWT:         %[[B:.*]] = bufferization.alloc_tensor(%[[DD0]])
-// CHECK-RWT:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
-// CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R:.*]] = %[[B]])
-// CHECK-RWT:           %[[SI:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT:           %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[I]]] : memref<?xf64>
-// CHECK-RWT:           %[[T1:.*]] = arith.muli %[[DD0]], %[[C10]] : index
-// CHECK-RWT:           %[[T2:.*]] = arith.divui %[[T1]], %[[DD0]] : index
-// CHECK-RWT:           %[[DI0:.*]] = arith.divui %[[SI]], %[[T2]] : index
-// CHECK-RWT:           %[[T3:.*]] = arith.remui %[[SI]], %[[T2]] : index
-// CHECK-RWT:           %[[T4:.*]] = arith.divui %[[T2]], %[[C10]] : index
-// CHECK-RWT:           %[[DI1:.*]] = arith.divui %[[T3]], %[[T4]] : index
-// CHECK-RWT:           %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R]]{{\[}}%[[DI0]], %[[DI1]]]
-// CHECK-RWT:           scf.yield %[[NT]]
-// CHECK-RWT:         }
-// CHECK-RWT:         %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT-NOT:     sparse_tensor.convert
-// CHECK-RWT:         return %[[NT1]] : tensor<?x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-LABEL:   func.func @dynamic_sparse_expand(
+// CHECK-SAME:    %[[S:.*0]]:
+// CHECK-DAG:     %[[C10:.*]] = arith.constant 10 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
+// CHECK:         %[[SD:.*]] = tensor.dim %[[S]], %[[C0]]
+// CHECK:         %[[DD0:.*]] = arith.divui %[[SD]], %[[C10]] : index
+// CHECK:         %[[B:.*]] = bufferization.alloc_tensor(%[[DD0]])
+// CHECK:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
+// CHECK:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
+// CHECK:         %[[V:.*]] = sparse_tensor.values %[[S]]
+// CHECK:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
+// CHECK:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R:.*]] = %[[B]])
+// CHECK:           %[[SI:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK:           %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[I]]] : memref<?xf64>
+// CHECK:           %[[T1:.*]] = arith.muli %[[DD0]], %[[C10]] : index
+// CHECK:           %[[T2:.*]] = arith.divui %[[T1]], %[[DD0]] : index
+// CHECK:           %[[DI0:.*]] = arith.divui %[[SI]], %[[T2]] : index
+// CHECK:           %[[T3:.*]] = arith.remui %[[SI]], %[[T2]] : index
+// CHECK:           %[[T4:.*]] = arith.divui %[[T2]], %[[C10]] : index
+// CHECK:           %[[DI1:.*]] = arith.divui %[[T3]], %[[T4]] : index
+// CHECK:           %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R]]{{\[}}%[[DI0]], %[[DI1]]]
+// CHECK:           scf.yield %[[NT]]
+// CHECK:         }
+// CHECK:         %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
+// CHECK-NOT:     sparse_tensor.convert
+// CHECK:         return %[[NT1]] : tensor<?x10xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
 func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>) -> tensor<?x10xf64, #SparseMatrix> {
   %0 = tensor.expand_shape %arg0 [[0, 1]] :
@@ -226,73 +141,42 @@ func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>) -> tensor<
 //      CHECK-ROUND:  %[[C:.*]] = tensor.collapse_shape %[[A]] {{\[\[}}0, 1]] : tensor<10x?xf64, #sparse_tensor.encoding<{{{.*}}}>> into tensor<?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //      CHECK-ROUND:  return %[[C]] : tensor<?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
-// conversion:
-//
-// CHECK-CONV-LABEL: func.func @dynamic_sparse_collapse(
-// CHECK-CONV-DAG:  %[[C0:.*]] = arith.constant 0 : index
-// CHECK-CONV-DAG:  %[[C1:.*]] = arith.constant 1 : index
-// CHECK-CONV-DAG:  %[[C10:.*]] = arith.constant 10 : index
-// CHECK-CONV-DAG:  %[[M1:.*]] = arith.muli %{{.*}}, %[[C10]] : index
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV-DAG:  call @newSparseTensor
-// CHECK-CONV:      scf.while : () -> () {
-// CHECK-CONV:        call @getNextF64
-// CHECK-CONV:        scf.condition
-// CHECK-CONV:      } do {
-// CHECK-CONV:        %[[X:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<2xindex>
-// CHECK-CONV:        %[[Y:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<2xindex>
-// CHECK-CONV:        %[[D1:.*]] = arith.divui %[[M1]], %[[C10]] : index
-// CHECK-CONV:        %[[M2:.*]] = arith.muli %[[X]], %[[D1]] : index
-// CHECK-CONV:        %[[D2:.*]] = arith.divui %[[D1]], %{{.*}} : index
-// CHECK-CONV:        %[[M3:.*]] = arith.muli %[[Y]], %[[D2]] : index
-// CHECK-CONV:        %[[A:.*]] = arith.addi %[[M2]], %[[M3]] : index
-// CHECK-CONV:        memref.store %[[A]], %{{.*}}[%[[C0]]] : memref<1xindex>
-// CHECK-CONV:        call @addEltF64
-// CHECK-CONV:        scf.yield
-// CHECK-CONV:      }
-// CHECK-CONV:      %[[N:.*]] = call @newSparseTensor
-// CHECK-CONV:      call @delSparseTensorCOOF64
-// CHECK-CONV:      call @delSparseTensorIteratorF64
-// CHECK-CONV:      return %[[N]] : !llvm.ptr<i8>
-//
-// rewrite for codegen:
-//
-// CHECK-RWT-LABEL:   func.func @dynamic_sparse_collapse(
-// CHECK-RWT-SAME:    %[[S:.*]]:
-// CHECK-RWT-DAG:     %[[C10:.*]] = arith.constant 10 : index
-// CHECK-RWT-DAG:     %[[C0:.*]] = arith.constant 0 : index
-// CHECK-RWT-DAG:     %[[C1:.*]] = arith.constant 1 : index
-// CHECK-RWT:         %[[SD1:.*]] = tensor.dim %[[S]], %[[C1]]
-// CHECK-RWT:         %[[DD0:.*]] = arith.muli %[[SD1]], %[[C10]] : index
-// CHECK-RWT:         %[[B:.*]] = bufferization.alloc_tensor(%[[DD0]])
-// CHECK-RWT:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
-// CHECK-RWT:         %[[P1:.*]] = sparse_tensor.positions %[[S]] {level = 1 : index}
-// CHECK-RWT:         %[[I1:.*]] = sparse_tensor.coordinates %[[S]] {level = 1 : index}
-// CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
-// CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R0:.*]] = %[[B]])
-// CHECK-RWT:           %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
-// CHECK-RWT:           %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
-// CHECK-RWT:           %[[RET_1:.*]] = scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] iter_args(%[[R1:.*]] = %[[R0]])
-// CHECK-RWT:             %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
-// CHECK-RWT:             %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
-// CHECK-RWT:             %[[T1:.*]] = arith.divui %[[DD0]], %[[C10]] : index
-// CHECK-RWT:             %[[T2:.*]] = arith.muli %[[SI0]], %[[T1]] : index
-// CHECK-RWT:             %[[T3:.*]] = arith.divui %[[T1]], %[[SD1]] : index
-// CHECK-RWT:             %[[T4:.*]] = arith.muli %[[SI1]], %[[T3]] : index
-// CHECK-RWT:             %[[DI:.*]] = arith.addi %[[T2]], %[[T4]] : index
-// CHECK-RWT:             %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R1]]{{\[}}%[[DI]]]
-// CHECK-RWT              scf.yield %[[NT]]
-// CHECK-RWT            }
-// CHECK-RWT            scf.yield %[[RET_1]]
-// CHECK-RWT:        }
-// CHECK-RWT:        %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT-NOT:    sparse_tensor.convert
-// CHECK-RWT:        return %[[NT1]] : tensor<?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-LABEL:   func.func @dynamic_sparse_collapse(
+// CHECK-SAME:    %[[S:.*0]]:
+// CHECK-DAG:     %[[C10:.*]] = arith.constant 10 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
+// CHECK:         %[[SD1:.*]] = tensor.dim %[[S]], %[[C1]]
+// CHECK:         %[[DD0:.*]] = arith.muli %[[SD1]], %[[C10]] : index
+// CHECK:         %[[B:.*]] = bufferization.alloc_tensor(%[[DD0]])
+// CHECK:         %[[P0:.*]] = sparse_tensor.positions %[[S]] {level = 0 : index}
+// CHECK:         %[[I0:.*]] = sparse_tensor.coordinates %[[S]] {level = 0 : index}
+// CHECK:         %[[P1:.*]] = sparse_tensor.positions %[[S]] {level = 1 : index}
+// CHECK:         %[[I1:.*]] = sparse_tensor.coordinates %[[S]] {level = 1 : index}
+// CHECK:         %[[V:.*]] = sparse_tensor.values %[[S]]
+// CHECK:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
+// CHECK:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R0:.*]] = %[[B]])
+// CHECK:           %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-DAG:       %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-DAG:       %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
+// CHECK:           %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
+// CHECK:           %[[RET_1:.*]] = scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] iter_args(%[[R1:.*]] = %[[R0]])
+// CHECK:             %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
+// CHECK:             %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
+// CHECK:             %[[T1:.*]] = arith.divui %[[DD0]], %[[C10]] : index
+// CHECK:             %[[T2:.*]] = arith.muli %[[SI0]], %[[T1]] : index
+// CHECK:             %[[T3:.*]] = arith.divui %[[T1]], %[[SD1]] : index
+// CHECK:             %[[T4:.*]] = arith.muli %[[SI1]], %[[T3]] : index
+// CHECK:             %[[DI:.*]] = arith.addi %[[T2]], %[[T4]] : index
+// CHECK:             %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R1]]{{\[}}%[[DI]]]
+// CHECK              scf.yield %[[NT]]
+// CHECK            }
+// CHECK            scf.yield %[[RET_1]]
+// CHECK:        }
+// CHECK:        %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
+// CHECK-NOT:    sparse_tensor.convert
+// CHECK:        return %[[NT1]] : tensor<?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 //
 func.func @dynamic_sparse_collapse(%arg0: tensor<10x?xf64, #SparseMatrix>) -> tensor<?xf64, #SparseVector> {
   %0 = tensor.collapse_shape %arg0 [[0, 1]] :
diff --git a/mlir/test/Dialect/SparseTensor/sparse_tensor_reshape.mlir b/mlir/test/Dialect/SparseTensor/sparse_tensor_reshape.mlir
index 9368cc71c5faa4..a1578eb20b8ba3 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_tensor_reshape.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_tensor_reshape.mlir
@@ -4,7 +4,7 @@
 #SparseMatrix = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
 
 // CHECK:         func.func @sparse_reshape(
-// CHECK-SAME:    %[[S:.*]]:
+// CHECK-SAME:    %[[S:.*0]]:
 // CHECK-DAG:     %[[C25:.*]] = arith.constant 25 : index
 // CHECK-DAG:     %[[C10:.*]] = arith.constant 10 : index
 // CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir
index c7b16315bfed1b..a8b3c6af9ae893 100644
--- a/mlir/test/Dialect/Tensor/bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -tensor-bufferize -cse -split-input-file -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s -tensor-bufferize -cse -split-input-file | FileCheck %s
 
 // CHECK-LABEL:   func @dim(
 // CHECK-SAME:              %[[TENSOR:.*]]: tensor<*xf32>,
@@ -62,9 +62,12 @@ func.func @tensor.cast_to_unranked(%arg0: tensor<2xf32>) -> tensor<*xf32> {
 }
 
 // -----
+
+// CHECK-LABEL:   func @tensor.empty(
+// CHECK:           %[[ALLOC:.*]] = memref.alloc() {{.*}} : memref<5xf32>
+// CHECK:           %[[RET:.*]] = bufferization.to_tensor %[[ALLOC]] : memref<5xf32>
+// CHECK:           return %[[RET]] : tensor<5xf32>
 func.func @tensor.empty() -> tensor<5xf32> {
-  // expected-error@+2 {{failed to bufferize op}}
-  // expected-error@+1 {{cannot be bufferized, but can be converted to bufferization.alloc_tensor}}
   %0 = tensor.empty() : tensor<5xf32>
   return %0 : tensor<5xf32>
 }
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
index 9052744a1d3f98..38c3bb8af8107d 100644
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -384,20 +384,45 @@ func.func @tensor.reshape() -> tensor<2x2x5xf32> {
 // -----
 
 // CHECK-LABEL: @reshape_with_non_identity_layout(
-// CHECK-SAME:    %[[INPUT:[a-zA-Z0-9]*]]: memref<2x2xf32, strided<[?, ?], offset: ?>>,
-// CHECK-SAME:    %[[LAYOUT:[a-zA-Z0-9]*]]: memref<2xi32, strided<[?], offset: ?>>)
-func.func @reshape_with_non_identity_layout(%arg0: tensor<2x2xf32>, %arg1: tensor<2xi32>) -> tensor<1x2xf32> {
-
-  // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]][1, 0] [1, 2] [1, 1] : memref<2x2xf32, strided<[?, ?], offset: ?>> to memref<2xf32, strided<[?], offset: ?>>
-  %extracted_slice = tensor.extract_slice %arg0[1, 0] [1, 2] [1, 1] : tensor<2x2xf32> to tensor<2xf32>
+// CHECK-SAME:    %[[INPUT:[a-zA-Z0-9]*]]: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>,
+// CHECK-SAME:    %[[LAYOUT:[a-zA-Z0-9]*]]: memref<2xi32, strided<[?], offset: ?>>,
+func.func @reshape_with_non_identity_layout(%arg0: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>, %arg1: tensor<2xi32>, %idx: index) -> f32 {
+  %t = bufferization.to_tensor %arg0 restrict : memref<2x2xf32, strided<[?, ?], offset: ?>, 3>
+
+  // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]][1, 0] [1, 2] [1, 1] : memref<2x2xf32, strided<[?, ?], offset: ?>, 3> to memref<2xf32, strided<[?], offset: ?>, 3>
+  %extracted_slice = tensor.extract_slice %t[1, 0] [1, 2] [1, 1] : tensor<2x2xf32> to tensor<2xf32>
+
+  // To satisify the constraints of memref.reshape, the subview must be
+  // reallocated a buffer with an identity layout.
+  // CHECK: %[[ALLOC:.+]] = memref.alloc() {{.*}} : memref<2xf32, 3>
+  // CHECK: memref.copy %[[SUBVIEW]], %[[ALLOC]]
+  // CHECK: %[[RESHAPED:.+]] = memref.reshape %[[ALLOC]](%[[LAYOUT]]) : (memref<2xf32, 3>, memref<2xi32, strided<[?], offset: ?>>) -> memref<1x2xf32, 3>
+  %reshape = tensor.reshape %extracted_slice(%arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x2xf32>
 
-  // To satisify the constraints of memref.reshape, the subview must be cloned into
-  // a buffer with an identity layout.
-  // CHECK: %[[CLONED:.+]] = bufferization.clone %[[SUBVIEW]] : memref<2xf32, strided<[?], offset: ?>> to memref<2xf32>
-  // CHECK: %[[RESHAPED:.+]] = memref.reshape %[[CLONED]](%[[LAYOUT]]) : (memref<2xf32>, memref<2xi32, strided<[?], offset: ?>>) -> memref<1x2xf32>
+  %r = tensor.extract %reshape[%idx, %idx] : tensor<1x2xf32>
+  return %r : f32
+}
 
-  %reshape = tensor.reshape %extracted_slice(%arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x2xf32>
+// -----
 
-  // CHECK: return %[[RESHAPED]] : memref<1x2xf32>
-  return %reshape : tensor<1x2xf32>
+// CHECK-LABEL: func @collapse_shape_regression(
+//  CHECK-SAME:     %[[t:.*]]: memref<10x20xf32,
+func.func @collapse_shape_regression(
+    %t: tensor<10x20xf32>, %f: f32, %idx: index) {
+  // CHECK: %[[subview:.*]] = memref.subview %[[t]]
+  %0 = tensor.extract_slice %t [2, 3] [5, 6] [1, 1]
+      : tensor<10x20xf32> to tensor<5x6xf32>
+
+  // Insert a copy because the original %0 is read later.
+  // CHECK: %[[alloc1:.*]] = memref.alloc() {{.*}} : memref<5x6xf32>
+  // CHECK: memref.copy %[[subview]], %[[alloc1]]
+  // CHECK: memref.store {{.*}}, %[[alloc1]]
+  tensor.insert %f into %0[%idx, %idx] : tensor<5x6xf32>
+
+  // Insert a copy because the shape cannot be collapsed.
+  // CHECK: %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5x6xf32>
+  // CHECK: memref.copy %[[subview]], %[[alloc2]]
+  // CHECK: memref.collapse_shape %[[alloc2]]
+  tensor.collapse_shape %0[[0, 1]] : tensor<5x6xf32> into tensor<30xf32>
+  return
 }
diff --git a/mlir/test/Dialect/Tosa/broadcast.mlir b/mlir/test/Dialect/Tosa/broadcast.mlir
index 5dfd6433f5e373..7613aa3b8dd03d 100644
--- a/mlir/test/Dialect/Tosa/broadcast.mlir
+++ b/mlir/test/Dialect/Tosa/broadcast.mlir
@@ -174,7 +174,7 @@ func.func @test_broadcast20(%arg0: tensor<3x3x4x1xf32>, %arg1: tensor<4x5xf32>)
 func.func @test_broadcast_mul(%arg0: tensor<15x14xi32>, %arg1: tensor<17x16x15x14xi32>) -> tensor<17x16x15x14xi32> {
   // CHECK-DAG: %[[VAR0:.*]] = tosa.reshape %arg0 {new_shape = array<i64: 1, 1, 15, 14>}
   // CHECK: %[[VAR1:.*]] = tosa.mul %[[VAR0]], %arg1
-  %0 = tosa.mul %arg0, %arg1 {shift = 1 : i32 } : (tensor<15x14xi32>, tensor<17x16x15x14xi32>) -> tensor<17x16x15x14xi32>
+  %0 = tosa.mul %arg0, %arg1 {shift = 1 : i8 } : (tensor<15x14xi32>, tensor<17x16x15x14xi32>) -> tensor<17x16x15x14xi32>
   return %0 : tensor<17x16x15x14xi32>
 }
 
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 5ed5a383c6f6fe..323864ea901304 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -205,7 +205,7 @@ func.func @mul_one_float(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> {
   // CHECK: return %arg0
   // CHECK-NOT: tosa.mul
   %ones = "tosa.const"() {value = dense<1.0> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
-  %1 = tosa.mul %arg0, %ones {shift = 0 : i32} : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
+  %1 = tosa.mul %arg0, %ones {shift = 0 : i8} : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
   return %1 : tensor<2x3xf32>
 }
 
@@ -214,7 +214,7 @@ func.func @mul_bcast_one_float(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> {
   // CHECK: return %arg0
   // CHECK-NOT: tosa.mul
   %ones = "tosa.const"() {value = dense<1.0> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
-  %1 = tosa.mul %ones, %arg0 {shift = 0 : i32} : (tensor<1x1xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
+  %1 = tosa.mul %ones, %arg0 {shift = 0 : i8} : (tensor<1x1xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
   return %1 : tensor<2x3xf32>
 }
 
@@ -223,7 +223,7 @@ func.func @mul_one_int(%arg0: tensor<2x3xi32>) -> tensor<2x3xi32> {
   // CHECK: return %arg0
   // CHECK-NOT: tosa.mul
   %ones = "tosa.const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32>
-  %1 = tosa.mul %arg0, %ones {shift = 0 : i32} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32>
+  %1 = tosa.mul %arg0, %ones {shift = 0 : i8} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32>
   return %1 : tensor<2x3xi32>
 }
 
@@ -232,11 +232,11 @@ func.func @mul_zero_broadcast(%arg0: tensor<2x3xf32>) -> (tensor<2x3xf32>, tenso
   // CHECK: %[[ZERO:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<2x3xf32>}
   // CHECK-NOT: tosa.mul
   %zeros = "tosa.const"() {value = dense<0.0> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
-  %1 = tosa.mul %arg0, %zeros {shift = 0 : i32} : (tensor<2x3xf32>, tensor<1x1xf32>) -> tensor<2x3xf32>
+  %1 = tosa.mul %arg0, %zeros {shift = 0 : i8} : (tensor<2x3xf32>, tensor<1x1xf32>) -> tensor<2x3xf32>
 
   // CHECK-NOT: tosa.mul
   // CHECK: return %[[ZERO]], %[[ZERO]]
-  %2 = tosa.mul %zeros, %arg0 {shift = 0 : i32} : (tensor<1x1xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
+  %2 = tosa.mul %zeros, %arg0 {shift = 0 : i8} : (tensor<1x1xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
   return %1, %2 : tensor<2x3xf32>, tensor<2x3xf32>
 }
 
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index e66082d83cb907..56619fbc560e5f 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -213,7 +213,7 @@ func.func @fold_div_splat_i32() -> tensor<i32> {
 func.func @fold_mul_zero_rhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
   %zero = "tosa.const"() {value = dense<0.0> : tensor<f32>} : () -> tensor<f32>
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00>
-  %mul = tosa.mul %arg0, %zero {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %mul = tosa.mul %arg0, %zero {shift = 0 : i8} : (tensor<f32>, tensor<f32>) -> tensor<f32>
   // CHECK: return %[[ZERO]]
   return %mul : tensor<f32>
 }
@@ -224,7 +224,7 @@ func.func @fold_mul_zero_rhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
 func.func @fold_mul_zero_lhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
   %zero = "tosa.const"() {value = dense<0.0> : tensor<f32>} : () -> tensor<f32>
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00>
-  %mul = tosa.mul %zero, %arg0 {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %mul = tosa.mul %zero, %arg0 {shift = 0 : i8} : (tensor<f32>, tensor<f32>) -> tensor<f32>
   // CHECK: return %[[ZERO]]
   return %mul : tensor<f32>
 }
@@ -235,7 +235,7 @@ func.func @fold_mul_zero_lhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
 func.func @fold_mul_zero_rhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
   %zero = "tosa.const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0>
-  %mul = tosa.mul %arg0, %zero {shift = 0 : i32} : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %mul = tosa.mul %arg0, %zero {shift = 0 : i8} : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %[[ZERO]]
   return %mul : tensor<i32>
 }
@@ -246,7 +246,7 @@ func.func @fold_mul_zero_rhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
 func.func @fold_mul_zero_lhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
   %zero = "tosa.const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0>
-  %mul = tosa.mul %zero, %arg0 {shift = 0 : i32} : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %mul = tosa.mul %zero, %arg0 {shift = 0 : i8} : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %[[ZERO]]
   return %mul : tensor<i32>
 }
@@ -256,7 +256,7 @@ func.func @fold_mul_zero_lhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
 // CHECK-LABEL: @fold_mul_one_rhs_f32
 func.func @fold_mul_one_rhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
   %one = "tosa.const"() {value = dense<1.0> : tensor<f32>} : () -> tensor<f32>
-  %mul = tosa.mul %arg0, %one {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %mul = tosa.mul %arg0, %one {shift = 0 : i8} : (tensor<f32>, tensor<f32>) -> tensor<f32>
   // CHECK: return %arg0
   return %mul : tensor<f32>
 }
@@ -266,7 +266,7 @@ func.func @fold_mul_one_rhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
 // CHECK-LABEL: @fold_mul_one_lhs_f32
 func.func @fold_mul_one_lhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
   %one = "tosa.const"() {value = dense<1.0> : tensor<f32>} : () -> tensor<f32>
-  %mul = tosa.mul %one, %arg0 {shift = 0 : i32} : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %mul = tosa.mul %one, %arg0 {shift = 0 : i8} : (tensor<f32>, tensor<f32>) -> tensor<f32>
   // CHECK: return %arg0
   return %mul : tensor<f32>
 }
@@ -276,7 +276,7 @@ func.func @fold_mul_one_lhs_f32(%arg0: tensor<f32>) -> tensor<f32> {
 // CHECK-LABEL: @fold_mul_one_rhs_i32
 func.func @fold_mul_one_rhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
   %one = "tosa.const"() {value = dense<64> : tensor<i32>} : () -> tensor<i32>
-  %mul = tosa.mul %arg0, %one {shift = 6 : i32} : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %mul = tosa.mul %arg0, %one {shift = 6 : i8} : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %arg0
   return %mul : tensor<i32>
 }
@@ -286,7 +286,7 @@ func.func @fold_mul_one_rhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
 // CHECK-LABEL: @fold_mul_one_lhs_i32
 func.func @fold_mul_one_lhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
   %one = "tosa.const"() {value = dense<64> : tensor<i32>} : () -> tensor<i32>
-  %mul = tosa.mul %one, %arg0 {shift = 6 : i32} : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %mul = tosa.mul %one, %arg0 {shift = 6 : i8} : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %arg0
   return %mul : tensor<i32>
 }
@@ -297,7 +297,7 @@ func.func @fold_mul_one_lhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
 func.func @fold_mul_splat_i8() -> tensor<10xi32> {
   %one = "tosa.const"() {value = dense<17> : tensor<10xi8>} : () -> tensor<10xi8>
   %two = "tosa.const"() {value = dense<32> : tensor<10xi8>} : () -> tensor<10xi8>
-  %mul = tosa.mul %one, %two {shift = 3 : i32} : (tensor<10xi8>, tensor<10xi8>) -> tensor<10xi32>
+  %mul = tosa.mul %one, %two {shift = 3 : i8} : (tensor<10xi8>, tensor<10xi8>) -> tensor<10xi32>
   // CHECK: %[[THREE:.+]] = "tosa.const"() <{value = dense<68> : tensor<10xi32>}
   // CHECK: return %[[THREE]]
   return %mul : tensor<10xi32>
@@ -309,7 +309,7 @@ func.func @fold_mul_splat_i8() -> tensor<10xi32> {
 func.func @fold_mul_splat_f32() -> tensor<10xf32> {
   %one = "tosa.const"() {value = dense<3.0> : tensor<10xf32>} : () -> tensor<10xf32>
   %two = "tosa.const"() {value = dense<2.0> : tensor<10xf32>} : () -> tensor<10xf32>
-  %mul = tosa.mul %one, %two {shift = 0 : i32} : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>
+  %mul = tosa.mul %one, %two {shift = 0 : i8} : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>
   // CHECK: %[[THREE:.+]] = "tosa.const"() <{value = dense<6.000000e+00> : tensor<10xf32>}
   // CHECK: return %[[THREE]]
   return %mul : tensor<10xf32>
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 754843969ef8ef..7d7f2d31a4244c 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -224,14 +224,14 @@ func.func @test_min(%arg0: tensor<13x21x3xf32>, %arg1: tensor<1x21x3xf32>) -> te
 // -----
 // CHECK-LABEL: mul
 func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> {
-  %0 = tosa.mul %arg0, %arg1 {shift = 1 : i32} : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32>
+  %0 = tosa.mul %arg0, %arg1 {shift = 1 : i8} : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32>
   return %0 : tensor<13x21x3xf32>
 }
 
 // -----
 // CHECK-LABEL: mul
 func.func @test_mul_relaxed_result_type(%arg0: tensor<13x21x3xi16>, %arg1: tensor<13x1x3xi16>) -> tensor<13x21x3xi16> {
-  %0 = "tosa.mul"(%arg0, %arg1)  { shift = 1 : i32 } : (tensor<13x21x3xi16>, tensor<13x1x3xi16>) -> tensor<13x21x3xi16>
+  %0 = "tosa.mul"(%arg0, %arg1)  { shift = 1 : i8 } : (tensor<13x21x3xi16>, tensor<13x1x3xi16>) -> tensor<13x21x3xi16>
   return %0 : tensor<13x21x3xi16>
 }
 
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
index b3aed8ae84033e..c86bf5d056f85e 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
@@ -34,7 +34,7 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<
   // CHECK: %[[sIn:.+]] = tosa.sub %[[cIn]], %[[iZp]]
   // CHECK: %[[sWe:.+]] = tosa.sub %[[cWe]], %[[wZp]]
   // CHECK: %[[resWe:.+]] = tosa.reshape %[[sWe]] {new_shape = array<i64: 1, 1, 1, 2, 3>}
-  // CHECK: %[[mul:.+]] = tosa.mul %[[sIn]], %[[resWe]] {shift = 0 : i32}
+  // CHECK: %[[mul:.+]] = tosa.mul %[[sIn]], %[[resWe]] {shift = 0 : i8}
   // CHECK: %[[reO:.+]] = tosa.reshape %[[mul]] {new_shape = array<i64: 4, 10, 10, 6>}
   // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 6>}
   // CHECK: %[[add:.+]] = tosa.add %[[reO]], %[[reArg2]]
@@ -51,7 +51,7 @@ func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: t
   // CHECK: %[[reIn:.+]] = tosa.reshape %arg0 {new_shape = array<i64: 4, 10, 10, 2, 1>}
   // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<5x2xi64>, tensor<f32>) -> tensor<4x12x12x2x1xf32>
   // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1 {new_shape = array<i64: 1, 1, 1, 2, 3>}
-  // CHECK: %[[mul:.+]] = tosa.mul %3, %[[reArg1]] {shift = 0 : i32}
+  // CHECK: %[[mul:.+]] = tosa.mul %3, %[[reArg1]] {shift = 0 : i8}
   // CHECK: %[[reOut:.+]] = tosa.reshape %[[mul]] {new_shape = array<i64: 4, 12, 12, 6>}
   // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 6>}
   // CHECK: %[[add:.+]] = tosa.add %[[reOut]], %[[reArg2]]
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index d468ba582483cb..1ce4defcf4a6e6 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -114,8 +114,8 @@ func.func @test_binary_scalar_f32(%arg0 : tensor<4xf32>, %arg1 : tensor<f32>) ->
   // CHECK: tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
   %2 = tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<f32>) -> tensor<*xf32>
 
-  // CHECK: tosa.mul %arg0, %arg1 {shift = 0 : i32} : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
-  %3 = tosa.mul %arg0, %arg1 { shift = 0 : i32 } : (tensor<4xf32>, tensor<f32>) -> tensor<*xf32>
+  // CHECK: tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
+  %3 = tosa.mul %arg0, %arg1 { shift = 0 : i8 } : (tensor<4xf32>, tensor<f32>) -> tensor<*xf32>
 
   // CHECK: tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
   %4 = tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<f32>) -> tensor<*xf32>
@@ -148,8 +148,8 @@ func.func @test_binary_broadcast_f32(%arg0 : tensor<4xf32>, %arg1 : tensor<1xf32
   // CHECK: tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32>
   %2 = tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32>
 
-  // CHECK: tosa.mul %arg0, %arg1 {shift = 0 : i32} : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32>
-  %3 = tosa.mul %arg0, %arg1 { shift = 0 : i32 } : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32>
+  // CHECK: tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32>
+  %3 = tosa.mul %arg0, %arg1 { shift = 0 : i8 } : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32>
 
   // CHECK: tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32>
   %4 = tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32>
@@ -206,8 +206,8 @@ func.func @test_binary_i32(%arg0 : tensor<4xi32>, %arg1 : tensor<i32>) -> () {
   // CHECK: tosa.minimum %arg0, %arg1 : (tensor<4xi32>, tensor<i32>) -> tensor<4xi32>
   %10 = tosa.minimum %arg0, %arg1 : (tensor<4xi32>, tensor<i32>) -> tensor<*xi32>
 
-  // CHECK: tosa.mul %arg0, %arg1 {shift = 0 : i32} : (tensor<4xi32>, tensor<i32>) -> tensor<4xi32>
-  %11 = tosa.mul %arg0, %arg1 { shift = 0 : i32 }: (tensor<4xi32>, tensor<i32>) -> tensor<*xi32>
+  // CHECK: tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<4xi32>, tensor<i32>) -> tensor<4xi32>
+  %11 = tosa.mul %arg0, %arg1 { shift = 0 : i8 }: (tensor<4xi32>, tensor<i32>) -> tensor<*xi32>
 
   // CHECK: tosa.pow %arg0, %arg1 : (tensor<4xi32>, tensor<i32>) -> tensor<4xi32>
   %12 = tosa.pow %arg0, %arg1 : (tensor<4xi32>, tensor<i32>) -> tensor<*xi32>
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir
index 3d4cb077698293..dd8d141e994da0 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir
@@ -11,4 +11,10 @@
 
 // expected-remark @below {{message}}
 // expected-remark @below {{unannotated}}
+// expected-remark @below {{internal colliding (without suffix)}}
+// expected-remark @below {{internal colliding_0}}
+// expected-remark @below {{internal colliding_1}}
+// expected-remark @below {{internal colliding_3}}
+// expected-remark @below {{internal colliding_4}}
+// expected-remark @below {{internal colliding_5}}
 module {}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir
index b21abbbdfd6d04..7452deb39b6c18 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir
@@ -1,16 +1,16 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-file-name=%p/test-interpreter-external-symbol-def.mlir}, test-transform-dialect-interpreter)" \
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-file-name=%p/test-interpreter-external-symbol-def-invalid.mlir}, test-transform-dialect-interpreter)" \
 // RUN:             --verify-diagnostics --split-input-file
 
-// The definition of the @foo named sequence is provided in another file. It
+// The definition of the @print_message named sequence is provided in another file. It
 // will be included because of the pass option.
 
 module attributes {transform.with_named_sequence} {
   // expected-error @below {{external definition has a mismatching signature}}
-  transform.named_sequence private @foo(!transform.op<"builtin.module"> {transform.readonly})
+  transform.named_sequence private @print_message(!transform.op<"builtin.module"> {transform.readonly})
 
   transform.sequence failures(propagate) {
   ^bb0(%arg0: !transform.op<"builtin.module">):
-    include @foo failures(propagate) (%arg0) : (!transform.op<"builtin.module">) -> ()
+    include @print_message failures(propagate) (%arg0) : (!transform.op<"builtin.module">) -> ()
   }
 }
 
@@ -37,3 +37,18 @@ module attributes {transform.with_named_sequence} {
     include @consuming failures(suppress) (%arg0) : (!transform.any_op) -> ()
   }
 }
+
+// -----
+
+module attributes {transform.with_named_sequence} {
+  // expected-error @below {{doubly defined symbol @print_message}}
+  transform.named_sequence @print_message(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "message" : !transform.any_op
+    transform.yield
+  }
+
+  transform.sequence failures(suppress) {
+  ^bb0(%arg0: !transform.any_op):
+    include @print_message failures(propagate) (%arg0) : (!transform.any_op) -> ()
+  }
+}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir
index 04b6c5a02e0adf..7d0837abebde32 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir
@@ -4,29 +4,68 @@
 // RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-file-name=%p/test-interpreter-external-symbol-def.mlir}, test-transform-dialect-interpreter)" \
 // RUN:             --verify-diagnostics --split-input-file | FileCheck %s
 
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-file-name=%p/test-interpreter-external-symbol-def.mlir}, test-transform-dialect-interpreter{transform-library-file-name=%p/test-interpreter-external-symbol-def.mlir})" \
-// RUN:             --verify-diagnostics --split-input-file | FileCheck %s
-
-// The definition of the @foo named sequence is provided in another file. It
-// will be included because of the pass option. Repeated application of the
-// same pass, with or without the library option, should not be a problem.
+// The definition of the @print_message named sequence is provided in another
+// file. It will be included because of the pass option. Subsequent application
+// of the same pass works but only without the library file (since the first
+// application loads external symbols and loading them again woul make them
+// clash).
 // Note that the same diagnostic produced twice at the same location only
 // needs to be matched once.
 
 // expected-remark @below {{message}}
 // expected-remark @below {{unannotated}}
+// expected-remark @below {{internal colliding (without suffix)}}
+// expected-remark @below {{internal colliding_0}}
+// expected-remark @below {{internal colliding_1}}
+// expected-remark @below {{internal colliding_3}}
+// expected-remark @below {{internal colliding_4}}
+// expected-remark @below {{internal colliding_5}}
 module attributes {transform.with_named_sequence} {
-  // CHECK: transform.named_sequence @foo
-  // CHECK: test_print_remark_at_operand %{{.*}}, "message"
-  transform.named_sequence private @foo(!transform.any_op {transform.readonly})
+  // CHECK-DAG: transform.named_sequence @print_message(
+  // CHECK-DAG: transform.include @private_helper
+  transform.named_sequence private @print_message(!transform.any_op {transform.readonly})
+
+  // These ops collide with ops from the other module before or after renaming.
+  transform.named_sequence private @colliding(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "internal colliding (without suffix)" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_0(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "internal colliding_0" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_1(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "internal colliding_1" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_3(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "internal colliding_3" : !transform.any_op
+    transform.yield
+  }
+  // This symbol is public and thus can't be renamed.
+  // CHECK-DAG: transform.named_sequence @colliding_4(
+  transform.named_sequence @colliding_4(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "internal colliding_4" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_5(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "internal colliding_5" : !transform.any_op
+    transform.yield
+  }
 
-  // CHECK: transform.named_sequence @unannotated
-  // CHECK: test_print_remark_at_operand %{{.*}}, "unannotated"
-  transform.named_sequence private @unannotated(!transform.any_op {transform.readonly})
+  // CHECK-DAG: transform.named_sequence @unannotated(
+  // CHECK-DAG: test_print_remark_at_operand %{{.*}}, "unannotated"
+  transform.named_sequence @unannotated(!transform.any_op {transform.readonly})
 
   transform.sequence failures(propagate) {
   ^bb0(%arg0: !transform.any_op):
-    include @foo failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @print_message failures(propagate) (%arg0) : (!transform.any_op) -> ()
     include @unannotated failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @colliding failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @colliding_0 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @colliding_1 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @colliding_3 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @colliding_4 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    include @colliding_5 failures(propagate) (%arg0) : (!transform.any_op) -> ()
   }
 }
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def-invalid.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def-invalid.mlir
new file mode 100644
index 00000000000000..1d9ef1dbead63c
--- /dev/null
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def-invalid.mlir
@@ -0,0 +1,14 @@
+// RUN: mlir-opt %s
+
+module attributes {transform.with_named_sequence} {
+  // expected-note @below {{previously defined here}}
+  transform.named_sequence @print_message(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "message" : !transform.any_op
+    transform.yield
+  }
+
+  transform.named_sequence @consuming(%arg0: !transform.any_op {transform.consumed}) {
+    transform.test_consume_operand %arg0 : !transform.any_op
+    transform.yield
+  }
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def.mlir
index 1149bda98ab852..66f0f1f62683b7 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-def.mlir
@@ -1,11 +1,42 @@
 // RUN: mlir-opt %s
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) {
+  transform.named_sequence private @private_helper(%arg0: !transform.any_op {transform.readonly}) {
     transform.test_print_remark_at_operand %arg0, "message" : !transform.any_op
     transform.yield
   }
 
+  // These ops collide with ops from the other module before or after renaming.
+  transform.named_sequence private @colliding(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "external colliding (without suffix)" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_0(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "external colliding_0" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_2(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "external colliding_2" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_3(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "external colliding_3" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence private @colliding_4(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "external colliding_4" : !transform.any_op
+    transform.yield
+  }
+  transform.named_sequence @colliding_5(%arg0: !transform.any_op {transform.readonly}) {
+    transform.test_print_remark_at_operand %arg0, "external colliding_5" : !transform.any_op
+    transform.yield
+  }
+
+  transform.named_sequence @print_message(%arg0: !transform.any_op {transform.readonly}) {
+    transform.include @private_helper failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.yield
+  }
+
   transform.named_sequence @consuming(%arg0: !transform.any_op {transform.consumed}) {
     transform.test_consume_operand %arg0 : !transform.any_op
     transform.yield
@@ -15,4 +46,14 @@ module attributes {transform.with_named_sequence} {
     transform.test_print_remark_at_operand %arg0, "unannotated" : !transform.any_op
     transform.yield
   }
+
+  transform.named_sequence @symbol_user(%arg0: !transform.any_op {transform.readonly}) {
+    transform.include @colliding failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.include @colliding_0 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.include @colliding_2 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.include @colliding_3 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.include @colliding_4 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.include @colliding_5 failures(propagate) (%arg0) : (!transform.any_op) -> ()
+    transform.yield
+  }
 }
diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir
index daa179cb15408b..3891c16b411559 100644
--- a/mlir/test/Dialect/Transform/test-interpreter.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter.mlir
@@ -2037,3 +2037,75 @@ transform.sequence failures(propagate) {
   // expected-remark @below{{0}}
   test_print_number_of_associated_payload_ir_ops %empty_op : !transform.any_op
 }
+
+
+// -----
+
+func.func @no_constant_under_loop(%lb: index, %ub: index, %step: index) {
+  scf.for %i= %lb to %ub step %step {
+    arith.constant 0 : index
+  }
+  return
+}
+
+module @named_inclusion attributes { transform.with_named_sequence } {
+// Match `arith.constant`s that are not nested under a `scf.for` and ensure
+// there are none in the program
+
+transform.named_sequence @print(%root: !transform.any_op {transform.readonly}) {
+  transform.test_print_remark_at_operand %root, "matched func" : !transform.any_op
+  transform.yield 
+}
+
+transform.named_sequence @match_constant_not_under_scf_for(%root: !transform.any_op {transform.readonly}) 
+  -> !transform.any_op {
+  transform.match.operation_name %root ["arith.constant"] : !transform.any_op
+  %for = transform.get_parent_op %root { op_name = "scf.for", allow_empty_results }
+    : (!transform.any_op) -> (!transform.any_op)
+  transform.match.operation_empty %for : !transform.any_op
+  transform.yield %root : !transform.any_op
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op):
+  transform.foreach_match in %arg0
+      @match_constant_not_under_scf_for -> @print
+    : (!transform.any_op) -> (!transform.any_op)
+  transform.yield 
+}
+}
+
+// -----
+
+func.func @no_constant_under_loop(%lb: index, %ub: index, %step: index) {
+  // expected-remark @below {{no parent scf.for}}
+  arith.constant 0 : index
+  return
+}
+
+module @named_inclusion attributes { transform.with_named_sequence } {
+// Match `arith.constant`s that are not nested under a `scf.for` and ensure
+// there are none in the program
+
+transform.named_sequence @print(%root: !transform.any_op {transform.readonly}) {
+  transform.test_print_remark_at_operand %root, "no parent scf.for" : !transform.any_op
+  transform.yield 
+}
+
+transform.named_sequence @match_constant_not_under_scf_for(%root: !transform.any_op {transform.readonly}) 
+  -> !transform.any_op {
+  transform.match.operation_name %root ["arith.constant"] : !transform.any_op
+  %for = transform.get_parent_op %root { op_name = "scf.for", allow_empty_results }
+    : (!transform.any_op) -> (!transform.any_op)
+  transform.match.operation_empty %for : !transform.any_op
+  transform.yield %root : !transform.any_op
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op):
+  transform.foreach_match in %arg0
+      @match_constant_not_under_scf_for -> @print
+    : (!transform.any_op) -> (!transform.any_op)
+  transform.yield 
+}
+}
diff --git a/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir b/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir
index d9d2f44e6f16c1..e7863a9e8b7b78 100644
--- a/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir
+++ b/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir
@@ -105,3 +105,23 @@ func.func @broadcast_not_elementwise() -> vector<2x2xf32> {
 
   return %mm1 : vector<2x2xf32>
 }
+
+// CHECK-LABEL: func.func @dont_sink_cmp(
+//       CHECK:   %[[BROADCAST:.+]] = vector.broadcast
+//       CHECK:   %[[RETURN:.+]] = arith.cmpf uno, %[[BROADCAST]], %[[BROADCAST]]
+//       CHECK:   return %[[RETURN]]
+func.func @dont_sink_cmp(%arg0 : f32, %arg1 : vector<1xf32>) -> vector<1xi1> {
+  %0 = vector.broadcast %arg0 : f32 to vector<1xf32>
+  %1 = arith.cmpf uno, %0, %0 : vector<1xf32>
+  return %1 : vector<1xi1>
+}
+
+// CHECK-LABEL: func.func @dont_sink_fma(
+  //     CHECK:   %[[BROADCAST:.+]] = vector.broadcast
+  //     CHECK:   %[[RESULT:.+]] = vector.fma %[[BROADCAST]]
+  //     CHECK:   return %[[RESULT]]
+func.func @dont_sink_fma(%arg0 : f32) -> vector<1xf32> {
+  %0 = vector.broadcast %arg0 : f32 to vector<1xf32>
+  %1 = vector.fma %0, %0, %0 : vector<1xf32>
+  return %1 : vector<1xf32>
+}
diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir
index ed0509fcef6f12..d90d740b445312 100644
--- a/mlir/test/Examples/transform/ChH/full.mlir
+++ b/mlir/test/Examples/transform/ChH/full.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-opt %s --test-transform-dialect-interpreter \
 // RUN:             --test-transform-dialect-erase-schedule \
 // RUN:             --math-uplift-to-fma \
+// RUN:             --convert-bufferization-to-memref \
 // RUN:             --test-lower-to-llvm |\
 // RUN: FileCheck %s
 
@@ -307,10 +308,22 @@ module attributes { transform.with_named_sequence } {
     // transformation process, so invalidation is not an issue. However, if
     // other transformations, such as loop unrolling, are required after
     // bufferization, new handles should be produced using the match operations.
+    //
+    // One-shot bufferization itself does not produce buffer deallocations,
+    // which may lead to leaks. So we have to run the buffer deallocation pass
+    // pipeline to avoid them. Note that the transform dialect seamlessly runs
+    // named passes and pass pipelines: if desired, one could replace complex
+    // --pass-pipeline expressions with operations. Note that we apply the
+    // pipeline to functions rather than entire module to avoid running it
+    // on the transform IR that is contained in the module.
     %arg1 = transform.bufferization.one_shot_bufferize %arg0 {
       bufferize_function_boundaries = true,
       function_boundary_type_conversion = 1 : i32 }
       : (!transform.any_op) -> !transform.any_op
+    %f = transform.structured.match ops{["func.func"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "buffer-deallocation-pipeline" to %f
+      : (!transform.any_op) -> !transform.any_op
 
     // Apply general canonicalization and CSE to each function after
     // bufferization as new simplification opportunities may have appeared.
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir
index 05c4900fb58f78..6540c950ab675b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_libgen.mlir
@@ -42,7 +42,7 @@
   crdWidth = 32
 }>
 
-// TODO: "compressed_hi" is not supported by libgen path.
+// TODO: "loose_compressed" is not supported by libgen path.
 // #BCOO = #sparse_tensor.encoding<{
 //   map = (d0, d1, d2) -> (d0 : dense, d1 : compressed(nonunique, high), d2 : singleton)
 //}>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir
index 394b9a8448b543..8df0f320825198 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir
@@ -111,7 +111,7 @@ module {
       : (memref<?xi32, strided<[4], offset: ?>>, i32, i32, i32, i32, i32) -> ()
     call @storeValuesTo(%y1, %c5, %c7, %c4, %c9, %c7)
       : (memref<?xi32>, i32, i32, i32, i32, i32) -> ()
-    sparse_tensor.sort_coo quick_sort %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index}
+    sparse_tensor.sort quick_sort %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index}
       : memref<?xi32> jointly memref<?xi32>
     // Dumps memory in the same order as the perm_map such that the output is ordered.
     %x1v = vector.transfer_read %x1[%i0], %c100: memref<?xi32, strided<[4], offset: ?>>, vector<5xi32>
@@ -140,7 +140,7 @@ module {
       : (memref<?xi32, strided<[4], offset: ?>>, i32, i32, i32, i32, i32) -> ()
     call @storeValuesTo(%y1, %c5, %c7, %c4, %c9, %c7)
       : (memref<?xi32>, i32, i32, i32, i32, i32) -> ()
-    sparse_tensor.sort_coo insertion_sort_stable %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index}
+    sparse_tensor.sort insertion_sort_stable %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index}
       : memref<?xi32> jointly memref<?xi32>
     %x1v2 = vector.transfer_read %x1[%i0], %c100: memref<?xi32, strided<[4], offset: ?>>, vector<5xi32>
     vector.print %x1v2 : vector<5xi32>
@@ -168,7 +168,7 @@ module {
       : (memref<?xi32, strided<[4], offset: ?>>, i32, i32, i32, i32, i32) -> ()
     call @storeValuesTo(%y1, %c5, %c7, %c4, %c9, %c7)
       : (memref<?xi32>, i32, i32, i32, i32, i32) -> ()
-    sparse_tensor.sort_coo heap_sort %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index}
+    sparse_tensor.sort heap_sort %i5, %xy jointly %y1 {perm_map = #ID_MAP, ny = 1 : index}
       : memref<?xi32> jointly memref<?xi32>
     %x1v3 = vector.transfer_read %x1[%i0], %c100: memref<?xi32, strided<[4], offset: ?>>, vector<5xi32>
     vector.print %x1v3 : vector<5xi32>
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index aa11773defdb15..2ad39405cc06f4 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -1,25 +1,11 @@
 // RUN: mlir-opt %s \
-// RUN:    -convert-nvgpu-to-nvvm \
-// RUN:    -gpu-kernel-outlining \
-// RUN:    -convert-vector-to-scf  \
-// RUN:    -convert-scf-to-cf \
-// RUN:    -convert-nvvm-to-llvm \
-// RUN:    -convert-vector-to-llvm \
-// RUN:    -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:    -convert-arith-to-llvm \
-// RUN:    -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:    -convert-func-to-llvm \
-// RUN:    -canonicalize -cse \
-// RUN:    -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 // RUN:   --entry-point-result=void \
 // RUN:  | FileCheck %s
 
-
 // Test swizzling with TMA load
 // 128B Swizzle Each numbered cell is 16 byte 
 // |-------------------------------|
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 5c465f7de8abdb..242c5ff875cf44 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -1,19 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:    -convert-nvgpu-to-nvvm \
-// RUN:    -canonicalize -cse \
-// RUN:    -gpu-kernel-outlining \
-// RUN:    -convert-vector-to-scf  \
-// RUN:    -convert-scf-to-cf \
-// RUN:    -convert-nvvm-to-llvm \
-// RUN:    -convert-vector-to-llvm \
-// RUN:    -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:    -convert-arith-to-llvm \
-// RUN:    -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:    -convert-func-to-llvm \
-// RUN:    -canonicalize -cse \
-// RUN:    -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN:  | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 5331ebb87d37de..44b127bd409ba6 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -1,16 +1,10 @@
-// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
-// RUN:         -gpu-kernel-outlining \
-// RUN:         -convert-nvvm-to-llvm \
-// RUN:         -convert-scf-to-cf  \
-// RUN:         -convert-vector-to-llvm \
-// RUN:         -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:         -convert-arith-to-llvm \
-// RUN:         -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN:         -convert-func-to-llvm \
-// RUN:         -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
+// RUN: mlir-opt %s \
+// RUN:  -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
+// RUN:  | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN:  | FileCheck %s
 
 // Basic PTX check to make sure we are generating the right instructions.
 
diff --git a/mlir/test/Pass/action-profiler.mlir b/mlir/test/Pass/action-profiler.mlir
new file mode 100644
index 00000000000000..7735041d03c4a9
--- /dev/null
+++ b/mlir/test/Pass/action-profiler.mlir
@@ -0,0 +1,6 @@
+// RUN: mlir-opt %s --profile-actions-to=- -test-stats-pass -test-module-pass | FileCheck %s
+
+// CHECK: {"name": "pass-execution", "cat": "PERF", "ph": "B", "pid": 0, "tid": {{[0-9]+}}, "ts": {{[0-9]+}}, "args": {"desc": "`pass-execution` running `{{.*}}TestStatisticPass` on Operation `builtin.module`"}},
+// CHECK-NEXT: {"name": "pass-execution", "cat": "PERF", "ph": "E", "pid": 0, "tid": {{[0-9]+}}, "ts": {{[0-9]+}}},
+// CHECK-NEXT: {"name": "pass-execution", "cat": "PERF", "ph": "B", "pid": 0, "tid": {{[0-9]+}}, "ts": {{[0-9]+}}, "args": {"desc": "`pass-execution` running `{{.*}}TestModulePass` on Operation `builtin.module`"}},
+// CHECK-NEXT: {"name": "pass-execution", "cat": "PERF", "ph": "E", "pid": 0, "tid": {{[0-9]+}}, "ts": {{[0-9]+}}}
diff --git a/mlir/test/Target/LLVMIR/openmp-teams.mlir b/mlir/test/Target/LLVMIR/openmp-teams.mlir
new file mode 100644
index 00000000000000..16457e88774b93
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-teams.mlir
@@ -0,0 +1,136 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @foo()
+
+// CHECK-LABEL: @omp_teams_simple
+// CHECK: call void {{.*}} @__kmpc_fork_teams(ptr @{{.+}}, i32 0, ptr [[WRAPPER_FN:.+]])
+// CHECK: ret void
+llvm.func @omp_teams_simple() {
+    omp.teams {
+        llvm.call @foo() : () -> ()
+        omp.terminator
+    }
+    llvm.return
+}
+
+// CHECK: define internal void @[[OUTLINED_FN:.+]]()
+// CHECK:   call void @foo()
+// CHECK:   ret void
+// CHECK: define void [[WRAPPER_FN]](ptr {{.+}}, ptr {{.+}})
+// CHECK:   call void @[[OUTLINED_FN]]
+// CHECK:   ret void
+
+// -----
+
+llvm.func @foo(i32) -> ()
+
+// CHECK-LABEL: @omp_teams_shared_simple
+// CHECK-SAME: (i32 [[ARG0:%.+]])
+// CHECK: [[STRUCT_ARG:%.+]] = alloca { i32 }
+// CHECK: br
+// CHECK: [[GEP:%.+]] = getelementptr { i32 }, ptr [[STRUCT_ARG]], i32 0, i32 0
+// CHECK: store i32 [[ARG0]], ptr [[GEP]]
+// CHECK: call void {{.+}} @__kmpc_fork_teams(ptr @{{.+}}, i32 1, ptr [[WRAPPER_FN:.+]], ptr [[STRUCT_ARG]])
+// CHECK: ret void
+llvm.func @omp_teams_shared_simple(%arg0: i32) {
+    omp.teams {
+        llvm.call @foo(%arg0) : (i32) -> ()
+        omp.terminator
+    }
+    llvm.return
+}
+
+// CHECK: define internal void [[OUTLINED_FN:@.+]](ptr [[STRUCT_ARG:%.+]])
+// CHECK:   [[GEP:%.+]] = getelementptr { i32 }, ptr [[STRUCT_ARG]], i32 0, i32 0
+// CHECK:   [[LOAD_GEP:%.+]] = load i32, ptr [[GEP]]
+// CHECK:   call void @foo(i32 [[LOAD_GEP]])
+// CHECK:   ret void
+// CHECK: define void [[WRAPPER_FN]](ptr {{.+}}, ptr {{.+}}, ptr [[STRUCT_ARG:.+]])
+// CHECK:   call void [[OUTLINED_FN]](ptr [[STRUCT_ARG]])
+// CHECK:   ret void
+
+// -----
+
+llvm.func @my_alloca_fn() -> !llvm.ptr<i32>
+llvm.func @foo(i32, f32, !llvm.ptr<i32>, f128, !llvm.ptr<i32>, i32) -> ()
+llvm.func @bar()
+
+// CHECK-LABEL: @omp_teams_branching_shared
+// CHECK-SAME: (i1 [[CONDITION:%.+]], i32 [[ARG0:%.+]], float [[ARG1:%.+]], ptr [[ARG2:%.+]], fp128 [[ARG3:%.+]])
+
+// Checking that the allocation for struct argument happens in the alloca block.
+// CHECK: [[STRUCT_ARG:%.+]] = alloca { i1, i32, float, ptr, fp128, ptr, i32 }
+// CHECK: [[ALLOCATED:%.+]] = call ptr @my_alloca_fn()
+// CHECK: [[LOADED:%.+]] = load i32, ptr [[ALLOCATED]]
+// CHECK: br label
+
+// Checking that the shared values are stored properly in the struct arg.
+// CHECK: [[CONDITION_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]]
+// CHECK: store i1 [[CONDITION]], ptr [[CONDITION_PTR]]
+// CHECK: [[ARG0_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]], i32 0, i32 1
+// CHECK: store i32 [[ARG0]], ptr [[ARG0_PTR]]
+// CHECK: [[ARG1_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]], i32 0, i32 2
+// CHECK: store float [[ARG1]], ptr [[ARG1_PTR]]
+// CHECK: [[ARG2_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]], i32 0, i32 3
+// CHECK: store ptr [[ARG2]], ptr [[ARG2_PTR]]
+// CHECK: [[ARG3_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]], i32 0, i32 4
+// CHECK: store fp128 [[ARG3]], ptr [[ARG3_PTR]]
+// CHECK: [[ALLOCATED_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]], i32 0, i32 5
+// CHECK: store ptr [[ALLOCATED]], ptr [[ALLOCATED_PTR]]
+// CHECK: [[LOADED_PTR:%.+]] = getelementptr {{.+}}, ptr [[STRUCT_ARG]], i32 0, i32 6
+// CHECK: store i32 [[LOADED]], ptr [[LOADED_PTR]]
+
+// Runtime call.
+// CHECK: call void {{.+}} @__kmpc_fork_teams(ptr @{{.+}}, i32 1, ptr [[WRAPPER_FN:@.+]], ptr [[STRUCT_ARG]])
+// CHECK: br label
+// CHECK: call void @bar()
+// CHECK: ret void
+llvm.func @omp_teams_branching_shared(%condition: i1, %arg0: i32, %arg1: f32, %arg2: !llvm.ptr<i32>, %arg3: f128) {
+    %allocated = llvm.call @my_alloca_fn(): () -> !llvm.ptr<i32>
+    %loaded = llvm.load %allocated : !llvm.ptr<i32>
+    llvm.br ^codegenBlock
+^codegenBlock:
+    omp.teams {
+        llvm.cond_br %condition, ^true_block, ^false_block
+    ^true_block:
+        llvm.call @foo(%arg0, %arg1, %arg2, %arg3, %allocated, %loaded) : (i32, f32, !llvm.ptr<i32>, f128, !llvm.ptr<i32>, i32) -> ()
+        llvm.br ^exit
+    ^false_block:
+        llvm.br ^exit
+    ^exit:
+        omp.terminator
+    }
+    llvm.call @bar() : () -> ()
+    llvm.return
+}
+
+// Check the outlined function.
+// CHECK: define internal void [[OUTLINED_FN:@.+]](ptr [[DATA:%.+]])
+// CHECK:   [[CONDITION_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]]
+// CHECK:   [[CONDITION:%.+]] = load i1, ptr [[CONDITION_PTR]]
+// CHECK:   [[ARG0_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]], i32 0, i32 1
+// CHECK:   [[ARG0:%.+]] = load i32, ptr [[ARG0_PTR]]
+// CHECK:   [[ARG1_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]], i32 0, i32 2
+// CHECK:   [[ARG1:%.+]] = load float, ptr [[ARG1_PTR]]
+// CHECK:   [[ARG2_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]], i32 0, i32 3
+// CHECK:   [[ARG2:%.+]] = load ptr, ptr [[ARG2_PTR]]
+// CHECK:   [[ARG3_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]], i32 0, i32 4
+// CHECK:   [[ARG3:%.+]] = load fp128, ptr [[ARG3_PTR]]
+// CHECK:   [[ALLOCATED_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]], i32 0, i32 5
+// CHECK:   [[ALLOCATED:%.+]] = load ptr, ptr [[ALLOCATED_PTR]]
+// CHECK:   [[LOADED_PTR:%.+]] = getelementptr {{.+}}, ptr [[DATA]], i32 0, i32 6
+// CHECK:   [[LOADED:%.+]] = load i32, ptr [[LOADED_PTR]]
+// CHECK:   br label
+
+// CHECK:   br i1 [[CONDITION]], label %[[TRUE:.+]], label %[[FALSE:.+]]
+// CHECK: [[FALSE]]:
+// CHECK-NEXT: br label
+// CHECK: [[TRUE]]:
+// CHECK:   call void @foo(i32 [[ARG0]], float [[ARG1]], ptr [[ARG2]], fp128 [[ARG3]], ptr [[ALLOCATED]], i32 [[LOADED]])
+// CHECK-NEXT: br label
+// CHECK: ret void
+
+// Check the wrapper function
+// CHECK: define void [[WRAPPER_FN]](ptr {{.+}}, ptr {{.+}}, ptr [[DATA:%.+]])
+// CHECK:   call void [[OUTLINED_FN]](ptr [[DATA]])
+// CHECK:   ret void
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 328c803c39efb7..b5af22f23a77cb 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
@@ -74,6 +75,10 @@ struct TestLowerToNVVMOptions
       *this, "cubin-format",
       llvm::cl::desc("Compilation format to use to serialize to cubin."),
       llvm::cl::init("isa")};
+  PassOptions::Option<int> optLevel{
+      *this, "opt-level",
+      llvm::cl::desc("Optimization level for NVVM compilation"),
+      llvm::cl::init(2)};
 };
 
 //===----------------------------------------------------------------------===//
@@ -139,11 +144,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
 
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
 
   // Convert vector to LLVM (always needed).
@@ -153,6 +153,9 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  // This pass is needed for PTX building
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+
   // Sprinkle some cleanups.
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
@@ -163,6 +166,20 @@ void buildGpuPassPipeline(OpPassManager &pm,
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
+  // Start with a cleanup pass.
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+
+  //===----------------------------------------------------------------------===//
+  // NVGPU lowers device code as well as host code to the driver, so must run
+  // before outlining.
+  //===----------------------------------------------------------------------===//
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
   //===----------------------------------------------------------------------===//
   // Host-specific stuff.
   //===----------------------------------------------------------------------===//
@@ -242,6 +259,7 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   nvvmTargetOptions.triple = options.cubinTriple;
   nvvmTargetOptions.chip = options.cubinChip;
   nvvmTargetOptions.features = options.cubinFeatures;
+  nvvmTargetOptions.optLevel = options.optLevel;
   pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
 
   // Convert GPU to LLVM.
diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
index ec0a5548a16033..945c54c04d47ce 100644
--- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
@@ -323,5 +323,16 @@ def Test_IteratorTypeArrayAttr
     : TypedArrayAttrBase<Test_IteratorTypeEnum,
   "Iterator type should be an enum.">;
 
+def TestParamCopyCount : AttrParameter<"CopyCount", "", "const CopyCount &"> {}
+
+// Test overridding attribute builders with a custom builder.
+def TestCopyCount : Test_Attr<"TestCopyCount"> {
+  let mnemonic = "copy_count";
+  let parameters = (ins TestParamCopyCount:$copy_count);
+  let assemblyFormat = "`<` $copy_count `>`";
+}
+
+
+
 
 #endif // TEST_ATTRDEFS
diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
index 7fc2e6ab3ec0a0..c240354e5d9904 100644
--- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/ADT/bit.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 using namespace test;
@@ -175,6 +176,45 @@ static void printTrueFalse(AsmPrinter &p, std::optional<int> result) {
   p << (*result ? "true" : "false");
 }
 
+//===----------------------------------------------------------------------===//
+// CopyCountAttr Implementation
+//===----------------------------------------------------------------------===//
+
+CopyCount::CopyCount(const CopyCount &rhs) : value(rhs.value) {
+  CopyCount::counter++;
+}
+
+CopyCount &CopyCount::operator=(const CopyCount &rhs) {
+  CopyCount::counter++;
+  value = rhs.value;
+  return *this;
+}
+
+int CopyCount::counter;
+
+static bool operator==(const test::CopyCount &lhs, const test::CopyCount &rhs) {
+  return lhs.value == rhs.value;
+}
+
+llvm::raw_ostream &test::operator<<(llvm::raw_ostream &os,
+                                    const test::CopyCount &value) {
+  return os << value.value;
+}
+
+template <>
+struct mlir::FieldParser<test::CopyCount> {
+  static FailureOr<test::CopyCount> parse(AsmParser &parser) {
+    std::string value;
+    if (parser.parseKeyword(value))
+      return failure();
+    return test::CopyCount(value);
+  }
+};
+namespace test {
+llvm::hash_code hash_value(const test::CopyCount &copyCount) {
+  return llvm::hash_value(copyCount.value);
+}
+} // namespace test
 //===----------------------------------------------------------------------===//
 // Tablegen Generated Definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.h b/mlir/test/lib/Dialect/Test/TestAttributes.h
index cc73e078bf7e20..ef6eae51fdd628 100644
--- a/mlir/test/lib/Dialect/Test/TestAttributes.h
+++ b/mlir/test/lib/Dialect/Test/TestAttributes.h
@@ -29,6 +29,19 @@
 
 namespace test {
 class TestDialect;
+// Payload class for the CopyCountAttr.
+class CopyCount {
+public:
+  CopyCount(std::string value) : value(value) {}
+  CopyCount(const CopyCount &rhs);
+  CopyCount &operator=(const CopyCount &rhs);
+  CopyCount(CopyCount &&rhs) = default;
+  CopyCount &operator=(CopyCount &&rhs) = default;
+  static int counter;
+  std::string value;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                              const test::CopyCount &value);
 
 /// A handle used to reference external elements instances.
 using TestDialectResourceBlobHandle =
diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp
index 3f1bb667a0aecb..8aa7774d2bf008 100644
--- a/mlir/test/lib/Dialect/Test/TestDialect.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp
@@ -998,7 +998,7 @@ void LoopBlockOp::getSuccessorRegions(
 
 OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) {
   assert(point == getBody());
-  return getInitMutable();
+  return MutableOperandRange(getInitMutable());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
index f73deef9d5fd48..578d9abe4a56ec 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
@@ -218,9 +218,9 @@ class TestTransformDialectInterpreterPass
           "select the container of the top-level transform op.")};
   Option<std::string> transformLibraryFileName{
       *this, "transform-library-file-name", llvm::cl::init(""),
-      llvm::cl::desc(
-          "Optional name of the file containing transform dialect symbol "
-          "definitions to be injected into the transform module.")};
+      llvm::cl::desc("Optional name of a file with a module that should be "
+                     "merged into the transform module to provide the "
+                     "definitions of external named sequences.")};
 
   Option<bool> testModuleGeneration{
       *this, "test-module-generation", llvm::cl::init(false),
diff --git a/mlir/test/lib/Pass/TestPassManager.cpp b/mlir/test/lib/Pass/TestPassManager.cpp
index 3b24cf44e21108..477b75916f80c8 100644
--- a/mlir/test/lib/Pass/TestPassManager.cpp
+++ b/mlir/test/lib/Pass/TestPassManager.cpp
@@ -53,7 +53,7 @@ struct TestOptionsPass
     : public PassWrapper<TestOptionsPass, OperationPass<func::FuncOp>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPass)
 
-  enum Enum {};
+  enum Enum { One, Two };
 
   struct Options : public PassPipelineOptions<Options> {
     ListOption<int> listOption{*this, "list",
diff --git a/mlir/test/mlir-tblgen/attrdefs.td b/mlir/test/mlir-tblgen/attrdefs.td
index 020cd0ca65b691..d7228368e71eb0 100644
--- a/mlir/test/mlir-tblgen/attrdefs.td
+++ b/mlir/test/mlir-tblgen/attrdefs.td
@@ -82,7 +82,7 @@ def B_CompoundAttrA : TestAttr<"CompoundA"> {
 // Check that AttributeSelfTypeParameter is handled properly.
 // DEF-LABEL: struct CompoundAAttrStorage
 // DEF: CompoundAAttrStorage(
-// DEF-SAME: inner(inner)
+// DEF-SAME: inner(std::move(inner))
 
 // DEF: bool operator==(const KeyTy &tblgenKey) const {
 // DEF-NEXT: return
@@ -94,7 +94,7 @@ def B_CompoundAttrA : TestAttr<"CompoundA"> {
 
 // DEF: static CompoundAAttrStorage *construct
 // DEF: return new (allocator.allocate<CompoundAAttrStorage>())
-// DEF-SAME: CompoundAAttrStorage(widthOfSomething, exampleTdType, apFloat, dims, inner);
+// DEF-SAME: CompoundAAttrStorage(std::move(widthOfSomething), std::move(exampleTdType), std::move(apFloat), std::move(dims), std::move(inner));
 
 // DEF: ::mlir::Type CompoundAAttr::getInner() const {
 // DEF-NEXT: return getImpl()->inner;
diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td
index 2afde1abdb7266..85d68cc42ccbd7 100644
--- a/mlir/test/mlir-tblgen/op-decl-and-defs.td
+++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td
@@ -97,7 +97,7 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
 // CHECK:   ::mlir::Operation::operand_range getODSOperands(unsigned index);
 // CHECK:   ::mlir::TypedValue<::mlir::IntegerType> getA();
 // CHECK:   ::mlir::Operation::operand_range getB();
-// CHECK:   ::mlir::MutableOperandRange getAMutable();
+// CHECK:   ::mlir::OpOperand &getAMutable();
 // CHECK:   ::mlir::MutableOperandRange getBMutable();
 // CHECK:   ::mlir::Operation::result_range getODSResults(unsigned index);
 // CHECK:   ::mlir::TypedValue<::mlir::IntegerType> getR();
diff --git a/mlir/test/python/dialects/cf.py b/mlir/test/python/dialects/cf.py
new file mode 100644
index 00000000000000..469e74d44a7dae
--- /dev/null
+++ b/mlir/test/python/dialects/cf.py
@@ -0,0 +1,50 @@
+# RUN: %PYTHON %s | FileCheck %s
+
+from mlir.ir import *
+from mlir.dialects import cf
+
+
+def constructAndPrintInModule(f):
+    print("\nTEST:", f.__name__)
+    with Context() as ctx, Location.unknown():
+        ctx.allow_unregistered_dialects = True
+        module = Module.create()
+        with InsertionPoint(module.body):
+            f()
+    return f
+
+
+# CHECK-LABEL: TEST: testBranchAndSetSuccessor
+@constructAndPrintInModule
+def testBranchAndSetSuccessor():
+    op1 = Operation.create("custom.op1", regions=1)
+
+    block0 = op1.regions[0].blocks.append()
+    ip = InsertionPoint(block0)
+    Operation.create("custom.terminator", ip=ip)
+
+    block1 = op1.regions[0].blocks.append()
+    ip = InsertionPoint(block1)
+    br1 = cf.BranchOp([], block1, ip=ip)
+    # CHECK: ^bb1:  // pred: ^bb1
+    # CHECK:   cf.br ^bb1
+    print(br1.successors[0])
+    # CHECK: num_successors 1
+    print("num_successors", len(br1.successors))
+
+    block2 = op1.regions[0].blocks.append()
+    ip = InsertionPoint(block2)
+    br2 = cf.BranchOp([], block1, ip=ip)
+    # CHECK: ^bb1:  // 2 preds: ^bb1, ^bb2
+    # CHECK:   cf.br ^bb1
+    print(br2.successors[0])
+    # CHECK: num_successors 1
+    print("num_successors", len(br2.successors))
+
+    br1.successors[0] = block2
+    # CHECK: ^bb2:  // pred: ^bb1
+    # CHECK:   cf.br ^bb1
+    print(br1.successors[0])
+    # CHECK: ^bb1:  // pred: ^bb2
+    # CHECK:   cf.br ^bb2
+    print(br2.operation.successors[0])
diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py
index e1048edce184a5..d80b878323377a 100644
--- a/mlir/test/python/dialects/sparse_tensor/dialect.py
+++ b/mlir/test/python/dialects/sparse_tensor/dialect.py
@@ -21,7 +21,7 @@ def testEncodingAttr1D():
             "  crdWidth = 32"
             "}>"
         )
-        # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ], posWidth = 16, crdWidth = 32 }>
+        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed), posWidth = 16, crdWidth = 32 }>
         print(parsed)
 
         casted = st.EncodingAttr(parsed)
@@ -38,7 +38,7 @@ def testEncodingAttr1D():
         print(f"crd_width: {casted.crd_width}")
 
         created = st.EncodingAttr.get(casted.lvl_types, None, 0, 0)
-        # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>
+        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
         print(created)
         # CHECK: created_equal: False
         print(f"created_equal: {created == casted}")
@@ -61,7 +61,7 @@ def testEncodingAttr2D():
             "  crdWidth = 32"
             "}>"
         )
-        # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)>, posWidth = 8, crdWidth = 32 }>
+        # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d1 : dense, d0 : compressed), posWidth = 8, crdWidth = 32 }>
         print(parsed)
 
         casted = st.EncodingAttr(parsed)
@@ -77,10 +77,8 @@ def testEncodingAttr2D():
         # CHECK: crd_width: 32
         print(f"crd_width: {casted.crd_width}")
 
-        created = st.EncodingAttr.get(
-            casted.lvl_types, casted.dim_to_lvl, 8, 32
-        )
-        # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)>, posWidth = 8, crdWidth = 32 }>
+        created = st.EncodingAttr.get(casted.lvl_types, casted.dim_to_lvl, 8, 32)
+        # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d1 : dense, d0 : compressed), posWidth = 8, crdWidth = 32 }>
         print(created)
         # CHECK: created_equal: True
         print(f"created_equal: {created == casted}")
@@ -100,8 +98,8 @@ def testEncodingAttrOnTensorType():
             )
         )
         tt = RankedTensorType.get((1024,), F32Type.get(), encoding=encoding)
-        # CHECK: tensor<1024xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ], posWidth = 64, crdWidth = 32 }>>
+        # CHECK: tensor<1024xf32, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed), posWidth = 64, crdWidth = 32 }>>
         print(tt)
-        # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ], posWidth = 64, crdWidth = 32 }>
+        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed), posWidth = 64, crdWidth = 32 }>
         print(tt.encoding)
         assert tt.encoding == encoding
diff --git a/mlir/test/python/dialects/transform_nvgpu_ext.py b/mlir/test/python/dialects/transform_nvgpu_ext.py
new file mode 100644
index 00000000000000..1de8b25bab7a5c
--- /dev/null
+++ b/mlir/test/python/dialects/transform_nvgpu_ext.py
@@ -0,0 +1,27 @@
+# RUN: %PYTHON %s | FileCheck %s
+
+from mlir.ir import *
+from mlir.dialects import transform
+from mlir.dialects.transform import nvgpu
+
+
+def run(f):
+    with Context(), Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            print("\nTEST:", f.__name__)
+            f()
+        print(module)
+    return f
+
+
+@run
+def testCreateAsyncGroups():
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get()
+    )
+    with InsertionPoint(sequence.body):
+        nvgpu.CreateAsyncGroupsOp(transform.AnyOpType.get(), sequence.bodyTarget)
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: testCreateAsyncGroups
+    # CHECK: transform.nvgpu.create_async_groups
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
index 664167e4f6c347..5898b0f7d69e83 100644
--- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
+++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
@@ -1029,13 +1029,13 @@ void {0}::regionBuilder(ImplicitLocOpBuilder &b,
       // {1}: attribute name
       // {2}: default type function name
       static const char attrDef[] = R"FMT(
-{0} {1}Val = {0}::{2};
-auto {1}Iter = llvm::find_if(attrs, [&](const NamedAttribute &attr) {{
-                              return attr.getName() == "{1}"; });
-if ({1}Iter != attrs.end()) {{
-  if (auto attr = llvm::dyn_cast<{0}Attr>({1}Iter->getValue()))
-    {1}Val = attr.getValue();
-}
+  {0} {1}Val = {0}::{2};
+  auto {1}Iter = llvm::find_if(attrs, [&](const NamedAttribute &attr) {{
+                                return attr.getName() == "{1}"; });
+  if ({1}Iter != attrs.end()) {{
+    if (auto attr = llvm::dyn_cast<{0}Attr>({1}Iter->getValue()))
+      {1}Val = attr.getValue();
+  }
 )FMT";
       std::string enumName = convertOperandKindToEnumName(arg.kind);
       attrs.push_back(
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index f6e43d42d29f06..51f24de2442b95 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -353,7 +353,7 @@ void DefGen::emitDefaultBuilder() {
   MethodBody &body = m->body().indent();
   auto scope = body.scope("return Base::get(context", ");");
   for (const auto &param : params)
-    body << ", " << param.getName();
+    body << ", std::move(" << param.getName() << ")";
 }
 
 void DefGen::emitCheckedBuilder() {
@@ -474,8 +474,10 @@ void DefGen::emitTraitMethod(const InterfaceMethod &method) {
 void DefGen::emitStorageConstructor() {
   Constructor *ctor =
       storageCls->addConstructor<Method::Inline>(getBuilderParams({}));
-  for (auto &param : params)
-    ctor->addMemberInitializer(param.getName(), param.getName());
+  for (auto &param : params) {
+    std::string movedValue = ("std::move(" + param.getName() + ")").str();
+    ctor->addMemberInitializer(param.getName(), movedValue);
+  }
 }
 
 void DefGen::emitKeyType() {
@@ -525,11 +527,11 @@ void DefGen::emitConstruct() {
                                         : Method::Static,
       MethodParameter(strfmt("::mlir::{0}StorageAllocator &", valueType),
                       "allocator"),
-      MethodParameter("const KeyTy &", "tblgenKey"));
+      MethodParameter("KeyTy &&", "tblgenKey"));
   if (!def.hasStorageCustomConstructor()) {
     auto &body = construct->body().indent();
     for (const auto &it : llvm::enumerate(params)) {
-      body << formatv("auto {0} = std::get<{1}>(tblgenKey);\n",
+      body << formatv("auto {0} = std::move(std::get<{1}>(tblgenKey));\n",
                       it.value().getName(), it.index());
     }
     // Use the parameters' custom allocator code, if provided.
@@ -544,8 +546,9 @@ void DefGen::emitConstruct() {
         body.scope(strfmt("return new (allocator.allocate<{0}>()) {0}(",
                           def.getStorageClassName()),
                    ");");
-    llvm::interleaveComma(params, body,
-                          [&](auto &param) { body << param.getName(); });
+    llvm::interleaveComma(params, body, [&](auto &param) {
+      body << "std::move(" << param.getName() << ")";
+    });
   }
 }
 
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 5f6a4e3bc52a84..7029c0eac15c39 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -2071,14 +2071,26 @@ void OpEmitter::genNamedOperandSetters() {
       continue;
     std::string name = op.getGetterName(operand.name);
 
-    auto *m = opClass.addMethod(operand.isVariadicOfVariadic()
-                                    ? "::mlir::MutableOperandRangeRange"
-                                    : "::mlir::MutableOperandRange",
-                                name + "Mutable");
+    StringRef returnType;
+    if (operand.isVariadicOfVariadic()) {
+      returnType = "::mlir::MutableOperandRangeRange";
+    } else if (operand.isVariableLength()) {
+      returnType = "::mlir::MutableOperandRange";
+    } else {
+      returnType = "::mlir::OpOperand &";
+    }
+    auto *m = opClass.addMethod(returnType, name + "Mutable");
     ERROR_IF_PRUNED(m, name, op);
     auto &body = m->body();
-    body << "  auto range = getODSOperandIndexAndLength(" << i << ");\n"
-         << "  auto mutableRange = "
+    body << "  auto range = getODSOperandIndexAndLength(" << i << ");\n";
+
+    if (!operand.isVariadicOfVariadic() && !operand.isVariableLength()) {
+      // In case of a single operand, return a single OpOperand.
+      body << "  return getOperation()->getOpOperand(range.first);\n";
+      continue;
+    }
+
+    body << "  auto mutableRange = "
             "::mlir::MutableOperandRange(getOperation(), "
             "range.first, range.second";
     if (attrSizedOperands) {
diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp
index 4ec90101f48c85..db748b475a7ac6 100644
--- a/mlir/unittests/Bytecode/BytecodeTest.cpp
+++ b/mlir/unittests/Bytecode/BytecodeTest.cpp
@@ -21,8 +21,6 @@
 using namespace llvm;
 using namespace mlir;
 
-using ::testing::StartsWith;
-
 StringLiteral IRWithResources = R"(
 module @TestDialectResources attributes {
   bytecode.test = dense_resource<resource> : tensor<4xi32>
@@ -66,8 +64,7 @@ TEST(Bytecode, MultiModuleWithResource) {
 
   // FIXME: Parsing external resources does not work on big-endian
   // platforms currently.
-  if (llvm::support::endian::system_endianness() ==
-      llvm::support::endianness::big)
+  if (llvm::endianness::native == llvm::endianness::big)
     GTEST_SKIP();
 
   // Try to see if we have a valid resource in the parsed module.
diff --git a/mlir/unittests/IR/AttributeTest.cpp b/mlir/unittests/IR/AttributeTest.cpp
index 9afbce037b408c..20167825d4ca82 100644
--- a/mlir/unittests/IR/AttributeTest.cpp
+++ b/mlir/unittests/IR/AttributeTest.cpp
@@ -13,6 +13,8 @@
 #include "gtest/gtest.h"
 #include <optional>
 
+#include "../../test/lib/Dialect/Test/TestDialect.h"
+
 using namespace mlir;
 using namespace mlir::detail;
 
@@ -459,4 +461,26 @@ TEST(SubElementTest, Nested) {
             ArrayRef<Attribute>(
                 {strAttr, trueAttr, falseAttr, boolArrayAttr, dictAttr}));
 }
+
+// Test how many times we call copy-ctor when building an attribute.
+TEST(CopyCountAttr, CopyCount) {
+  MLIRContext context;
+  context.loadDialect<test::TestDialect>();
+
+  test::CopyCount::counter = 0;
+  test::CopyCount copyCount("hello");
+  test::TestCopyCountAttr::get(&context, std::move(copyCount));
+  int counter1 = test::CopyCount::counter;
+  test::CopyCount::counter = 0;
+  test::TestCopyCountAttr::get(&context, std::move(copyCount));
+#ifndef NDEBUG
+  // One verification enabled only in assert-mode requires a copy.
+  EXPECT_EQ(counter1, 1);
+  EXPECT_EQ(test::CopyCount::counter, 1);
+#else
+  EXPECT_EQ(counter1, 0);
+  EXPECT_EQ(test::CopyCount::counter, 0);
+#endif
+}
+
 } // namespace
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 844107b3dade59..5f592f46c8ffa5 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -95,6 +95,7 @@ if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND (LIBC_GPU_BUILD OR
                                             LIBC_GPU_ARCHITECTURES))
   set(LIBC_GPU_SUPPORT TRUE)
 endif()
+
 set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LIBC_GPU_SUPPORT} CACHE BOOL
     "Libomptarget support for the GPU libc")
 pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT)
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index 5deee9c53926e7..809c5f03886b04 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -20,10 +20,10 @@ using namespace ompx;
 
 #pragma omp begin declare target device_type(nohost)
 
-// defined by CGOpenMPRuntimeGPU
-extern uint32_t __omp_rtl_debug_kind;
-extern uint32_t __omp_rtl_assume_no_thread_state;
-extern uint32_t __omp_rtl_assume_no_nested_parallelism;
+// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
+[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
+[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
+[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0;
 
 // This variable should be visibile to the plugin so we override the default
 // hidden visibility.
diff --git a/openmp/libomptarget/DeviceRTL/src/exports b/openmp/libomptarget/DeviceRTL/src/exports
index 2d13195aa7dc87..fbcda3ce8f555c 100644
--- a/openmp/libomptarget/DeviceRTL/src/exports
+++ b/openmp/libomptarget/DeviceRTL/src/exports
@@ -3,6 +3,10 @@ ompx_*
 *llvm_*
 __kmpc_*
 
+__omp_rtl_debug_kind
+__omp_rtl_assume_no_thread_state
+__omp_rtl_assume_no_nested_parallelism
+
 _ZN4ompx*
 
 IsSPMDMode
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 5f21b16b3fbfb1..0b4a393405a4fc 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -358,7 +358,7 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
   assert(Rc == OFFLOAD_SUCCESS &&
          "__tgt_activate_record_replay unexpected failure!");
   return OMP_TGT_SUCCESS;
-};
+}
 
 /// Implements a target kernel entry that replays a pre-recorded kernel.
 /// \param Loc Source location associated with this target region (unused).
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index a53d4e32436de1..80a3c10d3a949f 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -128,6 +128,7 @@ elif config.operating_system == 'Darwin':
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
 else: # Unices
+    config.test_flags += " -nogpulib"
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
     config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
@@ -151,6 +152,12 @@ def remove_suffix_if_present(name):
     else:
         return name
 
+def add_libraries(source):
+    if config.libomptarget_has_libc:
+        return source + " " + config.llvm_library_dir + "/libcgpu.a " + \
+               config.library_dir + "/libomptarget.devicertl.a"
+    return source + " " + config.library_dir + "/libomptarget.devicertl.a"
+
 # substitutions
 # - for targets that exist in the system create the actual command.
 # - for valid targets that do not exist in the system, return false, so that the
@@ -229,12 +236,10 @@ for libomptarget_target in config.libomptarget_all_targets:
             "%libomptarget-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compilexx-" + \
             libomptarget_target, \
-            "%clangxx-" + libomptarget_target + " %s -o %t" + \
-            (" -lcgpu" if config.libomptarget_has_libc else "")))
+            "%clangxx-" + libomptarget_target + add_libraries(" %s -o %t")))
         config.substitutions.append(("%libomptarget-compile-" + \
             libomptarget_target, \
-            "%clang-" + libomptarget_target + " %s -o %t" +
-            (" -lcgpu" if config.libomptarget_has_libc else "")))
+            "%clang-" + libomptarget_target + add_libraries(" %s -o %t")))
         config.substitutions.append(("%libomptarget-compile-fortran-" + \
             libomptarget_target, \
             "%flang-" + libomptarget_target + " %s -o %t"))
@@ -256,12 +261,10 @@ for libomptarget_target in config.libomptarget_all_targets:
             "%libomptarget-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compileoptxx-" + \
             libomptarget_target, \
-            "%clangxx-" + libomptarget_target + " -O3 %s -o %t" +
-            (" -lcgpu" if config.libomptarget_has_libc else "")))
+            "%clangxx-" + libomptarget_target + add_libraries(" -O3 %s -o %t")))
         config.substitutions.append(("%libomptarget-compileopt-" + \
             libomptarget_target, \
-            "%clang-" + libomptarget_target + " -O3 %s -o %t" +
-            (" -lcgpu" if config.libomptarget_has_libc else "")))
+            "%clang-" + libomptarget_target + add_libraries(" -O3 %s -o %t")))
         config.substitutions.append(("%libomptarget-run-" + \
             libomptarget_target, \
             "%t"))
diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in
index 9ee60d6082b77e..2c6b90ad46019b 100644
--- a/openmp/libomptarget/test/lit.site.cfg.in
+++ b/openmp/libomptarget/test/lit.site.cfg.in
@@ -12,6 +12,7 @@ config.cuda_libdir = "@CUDA_LIBDIR@"
 config.cuda_test_arch = "@LIBOMPTARGET_DEP_CUDA_ARCH@"
 config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@/@CURRENT_TARGET@"
 config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
+config.llvm_library_dir = "@LIBOMPTARGET_LLVM_LIBRARY_DIR@"
 config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
 config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
 config.llvm_lib_directory = "@LIBOMPTARGET_LLVM_LIBRARY_DIR@"
diff --git a/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 b/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
new file mode 100644
index 00000000000000..b4c793ca06cf79
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
@@ -0,0 +1,43 @@
+! Offloading test with two target regions mapping the same
+! declare target Fortran array and writing some values to 
+! it before checking the host correctly receives the 
+! correct updates made on the device.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module test_0
+    implicit none
+    integer :: sp(10) = (/0,0,0,0,0,0,0,0,0,0/)
+    !$omp declare target link(sp)
+end module test_0
+
+program main
+    use test_0
+    integer :: i = 1
+    integer :: j = 11
+
+!$omp target map(tofrom:sp) map(to: i, j)
+    do while (i <= j)
+        sp(i) = i;
+        i = i + 1
+    end do
+!$omp end target
+
+!$omp target map(tofrom:sp) map(to: i, j)
+    do while (i <= j)
+        sp(i) = sp(i) + i;
+        i = i + 1
+    end do
+!$omp end target
+    
+print *, sp(:)
+
+end program
+
+! CHECK: 2 4 6 8 10 12 14 16 18 20
diff --git a/openmp/libomptarget/test/offloading/std_complex_arithmetic.cpp b/openmp/libomptarget/test/offloading/std_complex_arithmetic.cpp
index b06d605d29f90c..44f87ad732bfbf 100644
--- a/openmp/libomptarget/test/offloading/std_complex_arithmetic.cpp
+++ b/openmp/libomptarget/test/offloading/std_complex_arithmetic.cpp
@@ -3,6 +3,10 @@
 // RUN: %libomptarget-compilexx-generic -O3 -ffast-math && \
 // RUN:   %libomptarget-run-generic
 
+// FIXME: This fails to link due to missing math symbols. We should provide the
+//        needed math functions in the GPU `libm` and require the GPU C library.
+// UNSUPPORTED: amdgcn-amd-amdhsa
+
 #include <cassert>
 #include <complex>
 #include <iostream>
diff --git a/openmp/libomptarget/test/offloading/weak.c b/openmp/libomptarget/test/offloading/weak.c
new file mode 100644
index 00000000000000..ca81db958356b2
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/weak.c
@@ -0,0 +1,33 @@
+// RUN: %libomptarget-compile-generic -DA -c -o %t-a.o
+// RUN: %libomptarget-compile-generic -DB -c -o %t-b.o
+// RUN: %libomptarget-compile-generic %t-a.o %t-b.o && \
+// RUN:   %libomptarget-run-generic | %fcheck-generic
+
+#if defined(A)
+__attribute__((weak)) int x = 999;
+#pragma omp declare target to(x)
+#elif defined(B)
+int x = 42;
+#pragma omp declare target to(x)
+__attribute__((weak)) int y = 42;
+#pragma omp declare target to(y)
+#else
+
+#include <stdio.h>
+
+extern int x;
+#pragma omp declare target to(x)
+extern int y;
+#pragma omp declare target to(y)
+
+int main() {
+  x = 0;
+
+#pragma omp target update from(x)
+#pragma omp target update from(y)
+
+  // CHECK: PASS
+  if (x == 42 && y == 42)
+    printf("PASS\n");
+}
+#endif
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index c26a5f725d59f5..010ec879e44a32 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -19,7 +19,7 @@ list(INSERT CMAKE_MODULE_PATH 0
 
 # We order libraries to mirror roughly how they are layered, except that compiler-rt can depend
 # on libc++, so we put it after.
-set(LLVM_DEFAULT_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp;flang-rt")
+set(LLVM_DEFAULT_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp")
 set(LLVM_SUPPORTED_RUNTIMES "${LLVM_DEFAULT_RUNTIMES};llvm-libgcc")
 set(LLVM_ENABLE_RUNTIMES "" CACHE STRING
   "Semicolon-separated list of runtimes to build, or \"all\" (${LLVM_DEFAULT_RUNTIMES}). Supported runtimes are ${LLVM_SUPPORTED_RUNTIMES}.")
diff --git a/runtimes/cmake/Modules/HandleLitArguments.cmake b/runtimes/cmake/Modules/HandleLitArguments.cmake
new file mode 100644
index 00000000000000..ccd5dcce4f5bc7
--- /dev/null
+++ b/runtimes/cmake/Modules/HandleLitArguments.cmake
@@ -0,0 +1,20 @@
+
+macro(serialize_lit_param output_var param value)
+  string(APPEND ${output_var} "config.${param} = ${value}\n")
+endmacro()
+
+macro(serialize_lit_string_param output_var param value)
+  # Ensure that all quotes in the value are escaped for a valid python string.
+  string(REPLACE "\"" "\\\"" _escaped_value "${value}")
+  string(APPEND ${output_var} "config.${param} = \"${_escaped_value}\"\n")
+endmacro()
+
+macro(serialize_lit_params_list output_var list)
+  foreach(param IN LISTS ${list})
+    string(FIND "${param}" "=" _eq_index)
+    string(SUBSTRING "${param}" 0 ${_eq_index} name)
+    string(SUBSTRING "${param}" ${_eq_index} -1 value)
+    string(SUBSTRING "${value}" 1 -1 value) # strip the leading =
+    serialize_lit_string_param("${output_var}" "${name}" "${value}")
+  endforeach()
+endmacro()
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 258ae5168887fe..6fc4be39b063f0 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -11,7 +11,7 @@ load(
 )
 load(":platforms.bzl", "PLATFORM_CPU_ARM64", "PLATFORM_CPU_X86_64")
 load("@bazel_skylib//lib:selects.bzl", "selects")
-load("@bazel_skylib//rules:common_settings.bzl", "string_flag")
+load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "string_flag")
 load("//:vars.bzl", "LLVM_VERSION_MAJOR", "LLVM_VERSION_MINOR", "LLVM_VERSION_PATCH")
 
 package(
@@ -35,8 +35,6 @@ MEMORY_COPTS = [
     # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
 ]
 
-llvm_libc_namespace = "__llvm_libc_{}_{}_{}_git".format(LLVM_VERSION_MAJOR, LLVM_VERSION_MINOR, LLVM_VERSION_PATCH)
-
 # A flag to pick which `mpfr` to use for math tests.
 # Usage: `--@llvm-project//libc:mpfr=<disable|external|system>`.
 # Flag documentation: https://bazel.build/extending/config
@@ -65,6 +63,23 @@ config_setting(
     flag_values = {":mpfr": "system"},
 )
 
+default_libc_namespace = "__llvm_libc_{}_{}_{}_git".format(LLVM_VERSION_MAJOR, LLVM_VERSION_MINOR, LLVM_VERSION_PATCH)
+
+release_libc_namespace = "__llvm_libc"
+
+# When set, The ':libc_root' target below will define 'LIBC_NAMESPACE' to
+# 'release_libc_namespace' instead of 'default_libc_namespace'.
+# Usage: `--@llvm-project//libc:release`.
+bool_flag(
+    name = "release",
+    build_setting_default = False,
+)
+
+config_setting(
+    name = "use_release_namespace",
+    flag_values = {":release": "true"},
+)
+
 # This empty root library helps us add an include path to this directory
 # using the 'includes' attribute. The strings listed in the includes attribute
 # are relative paths wrt this library but are inherited by the dependents
@@ -72,7 +87,10 @@ config_setting(
 # paths of the kind "../../" to other libc targets.
 cc_library(
     name = "libc_root",
-    defines = ["LIBC_NAMESPACE=" + llvm_libc_namespace],
+    defines = select({
+        ":use_release_namespace": ["LIBC_NAMESPACE=" + release_libc_namespace],
+        "//conditions:default": ["LIBC_NAMESPACE=" + default_libc_namespace],
+    }),
     includes = ["."],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index f1a377588f895a..53b626996f8bbf 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1887,21 +1887,16 @@ gentbl_cc_library(
 )
 
 gentbl_cc_library(
-    name = "ArmSMEConversionIncGen",
+    name = "ArmSMEOpsIncGen",
     tbl_outs = [
         (
-            ["-gen-llvmir-conversions"],
-            "include/mlir/Dialect/ArmSME/IR/ArmSMEConversions.inc",
+            ["-gen-op-decls"],
+            "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.h.inc",
+        ),
+        (
+            ["-gen-op-defs"],
+            "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.cpp.inc",
         ),
-    ],
-    tblgen = ":mlir-tblgen",
-    td_file = "include/mlir/Dialect/ArmSME/IR/ArmSME.td",
-    deps = [":ArmSMETdFiles"],
-)
-
-gentbl_cc_library(
-    name = "ArmSMEAttrDefsIncGen",
-    tbl_outs = [
         (
             ["-gen-enum-decls"],
             "include/mlir/Dialect/ArmSME/IR/ArmSMEEnums.h.inc",
@@ -1926,7 +1921,41 @@ gentbl_cc_library(
         ),
     ],
     tblgen = ":mlir-tblgen",
-    td_file = "include/mlir/Dialect/ArmSME/IR/ArmSME.td",
+    td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td",
+    deps = [":ArmSMETdFiles"],
+)
+
+gentbl_cc_library(
+    name = "ArmSMEConversionIncGen",
+    tbl_outs = [
+        (
+            ["-gen-llvmir-conversions"],
+            "include/mlir/Dialect/ArmSME/IR/ArmSMEOpsConversions.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td",
+    deps = [":ArmSMETdFiles"],
+)
+
+gentbl_cc_library(
+    name = "ArmSMEIntrinsicOpsIncGen",
+    tbl_outs = [
+        (
+            ["-gen-op-decls"],
+            "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.h.inc",
+        ),
+        (
+            ["-gen-op-defs"],
+            "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.cpp.inc",
+        ),
+        (
+            ["-gen-llvmir-conversions"],
+            "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicConversions.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td",
     deps = [":ArmSMETdFiles"],
 )
 
@@ -1937,7 +1966,8 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArmSMEIncGen",
-        ":ArmSMEAttrDefsIncGen",
+        ":ArmSMEIntrinsicOpsIncGen",
+        ":ArmSMEOpsIncGen",
         ":IR",
         ":LLVMDialect",
         ":MemRefDialect",
@@ -3052,6 +3082,7 @@ cc_library(
     deps = [
         ":GPUDialect",
         ":IR",
+        ":LLVMDialect",
         ":NVGPUIncGen",
         ":SideEffectInterfaces",
         "//llvm:Core",
@@ -4484,6 +4515,7 @@ cc_library(
     deps = [
         ":CAPIIR",
         ":IR",
+        ":Rewrite",
         ":Support",
         "//llvm:Support",
     ],
@@ -5269,6 +5301,7 @@ cc_library(
     ]),
     includes = ["include"],
     deps = [
+        ":ArithDialect",
         ":ConversionPassIncGen",
         ":GPUCommonTransforms",
         ":GPUDialect",
@@ -7922,6 +7955,7 @@ cc_library(
     deps = [
         ":ArmSMEConversionIncGen",
         ":ArmSMEDialect",
+        ":ArmSMEIntrinsicOpsIncGen",
         ":IR",
         ":ToLLVMIRTranslation",
         "//llvm:Core",
diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
index b8b70f4685feae..949bfa4fbda3a6 100644
--- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
@@ -1209,6 +1209,25 @@ gentbl_filegroup(
     ],
 )
 
+gentbl_filegroup(
+    name = "NVGPUTransformOpsPyGen",
+    tbl_outs = [
+        (
+            [
+                "-gen-python-op-bindings",
+                "-bind-dialect=transform",
+                "-dialect-extension=nvgpu_transform",
+            ],
+            "mlir/dialects/_nvgpu_transform_ops_gen.py",
+        ),
+    ],
+    tblgen = "//mlir:mlir-tblgen",
+    td_file = "mlir/dialects/NVGPUTransformOps.td",
+    deps = [
+        "//mlir:NVGPUTransformOpsTdFiles",
+    ],
+)
+
 gentbl_filegroup(
     name = "PDLTransformOpsPyGen",
     tbl_outs = [
@@ -1327,6 +1346,7 @@ filegroup(
         ":GPUTransformOpsPyGen",
         ":LoopTransformOpsPyGen",
         ":MemRefTransformOpsPyGen",
+        ":NVGPUTransformOpsPyGen",
         ":PDLTransformOpsPyGen",
         ":SparseTensorTransformOpsPyGen",
         ":StructureTransformEnumPyGen",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 192c9d156e5781..e43d09618a2cf6 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -3,9 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("//llvm:lit_test.bzl", "package_path")
 load("//mlir:build_defs.bzl", "if_cuda_available")
 load("//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
-load("//llvm:lit_test.bzl", "package_path")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -664,6 +664,7 @@ cc_library(
         "//mlir:MemRefToLLVM",
         "//mlir:MemRefTransforms",
         "//mlir:NVGPUToNVVM",
+        "//mlir:NVVMToLLVM",
         "//mlir:NVVMToLLVMIRTranslation",
         "//mlir:Pass",
         "//mlir:ROCDLToLLVMIRTranslation",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel
index c4feae055cec8e..1df8ae802e6d29 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel
@@ -29,6 +29,7 @@ package(default_visibility = ["//visibility:public"])
             "Transform/*-source.mlir",
             "Transform/*-symbol-def.mlir",
             "Transform/*-symbol-decl-and-schedule.mlir",
+            "Transform/*-symbol-decl-invalid.mlir",
             "Transform/test-repro-dump.mlir",
         ],
     )